From 0ef5e33e6538497ead2aa30919fd2df3d9338112 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 24 Oct 2025 15:16:11 +0200 Subject: [PATCH 01/81] first checkpoint --- src/zenml/enums.py | 1 + src/zenml/log_stores/__init__.py | 31 +++++++++++++++++++ src/zenml/stack/flavor_registry.py | 3 ++ src/zenml/stack/stack.py | 20 ++++++++++++ .../zen_stores/schemas/component_schemas.py | 5 --- src/zenml/zen_stores/schemas/logs_schemas.py | 20 +++++++----- 6 files changed, 68 insertions(+), 12 deletions(-) create mode 100644 src/zenml/log_stores/__init__.py diff --git a/src/zenml/enums.py b/src/zenml/enums.py index 93edbba278f..1e86964d557 100644 --- a/src/zenml/enums.py +++ b/src/zenml/enums.py @@ -157,6 +157,7 @@ class StackComponentType(StrEnum): EXPERIMENT_TRACKER = "experiment_tracker" FEATURE_STORE = "feature_store" IMAGE_BUILDER = "image_builder" + LOG_STORE = "log_store" MODEL_DEPLOYER = "model_deployer" ORCHESTRATOR = "orchestrator" STEP_OPERATOR = "step_operator" diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py new file mode 100644 index 00000000000..a520ebd4803 --- /dev/null +++ b/src/zenml/log_stores/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Log stores allow you to collect and store logs from pipeline runs. + +ZenML log stores provide different backends for storing pipeline and step logs. 
+""" + +from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig +from zenml.log_stores.datadog_flavor import DatadogLogStoreFlavor +from zenml.log_stores.otel_flavor import OtelLogStoreFlavor +from zenml.log_stores.utils import fetch_logs + +__all__ = [ + "BaseLogStore", + "BaseLogStoreConfig", + "DatadogLogStoreFlavor", + "OtelLogStoreFlavor", + "fetch_logs", +] + diff --git a/src/zenml/stack/flavor_registry.py b/src/zenml/stack/flavor_registry.py index 4a1237c8171..4c1fd139269 100644 --- a/src/zenml/stack/flavor_registry.py +++ b/src/zenml/stack/flavor_registry.py @@ -69,6 +69,7 @@ def builtin_flavors(self) -> List[Type[Flavor]]: ) from zenml.deployers import DockerDeployerFlavor from zenml.image_builders import LocalImageBuilderFlavor + from zenml.log_stores import DatadogLogStoreFlavor, OtelLogStoreFlavor from zenml.orchestrators import ( LocalDockerOrchestratorFlavor, LocalOrchestratorFlavor, @@ -85,6 +86,8 @@ def builtin_flavors(self) -> List[Type[Flavor]]: GitHubContainerRegistryFlavor, LocalImageBuilderFlavor, DockerDeployerFlavor, + OtelLogStoreFlavor, + DatadogLogStoreFlavor, ] return flavors diff --git a/src/zenml/stack/stack.py b/src/zenml/stack/stack.py index 4651b50c9ca..902471fb902 100644 --- a/src/zenml/stack/stack.py +++ b/src/zenml/stack/stack.py @@ -63,6 +63,7 @@ ) from zenml.feature_stores import BaseFeatureStore from zenml.image_builders import BaseImageBuilder + from zenml.log_stores import BaseLogStore from zenml.model_deployers import BaseModelDeployer from zenml.model_registries import BaseModelRegistry from zenml.models import ( @@ -112,6 +113,7 @@ def __init__( image_builder: Optional["BaseImageBuilder"] = None, model_registry: Optional["BaseModelRegistry"] = None, deployer: Optional["BaseDeployer"] = None, + log_store: Optional["BaseLogStore"] = None, ): """Initializes and validates a stack instance. @@ -135,6 +137,7 @@ def __init__( image_builder: Image builder component of the stack. model_registry: Model registry component of the stack. deployer: Deployer component of the stack. + log_store: Log store component of the stack. 
""" self._id = id self._name = name @@ -153,6 +156,7 @@ def __init__( self._model_registry = model_registry self._image_builder = image_builder self._deployer = deployer + self._log_store = log_store @classmethod def from_model(cls, stack_model: "StackResponse") -> "Stack": @@ -239,6 +243,7 @@ def from_components( from zenml.experiment_trackers import BaseExperimentTracker from zenml.feature_stores import BaseFeatureStore from zenml.image_builders import BaseImageBuilder + from zenml.log_stores import BaseLogStore from zenml.model_deployers import BaseModelDeployer from zenml.model_registries import BaseModelRegistry from zenml.orchestrators import BaseOrchestrator @@ -334,6 +339,10 @@ def _raise_type_error( if deployer is not None and not isinstance(deployer, BaseDeployer): _raise_type_error(deployer, BaseDeployer) + log_store = components.get(StackComponentType.LOG_STORE) + if log_store is not None and not isinstance(log_store, BaseLogStore): + _raise_type_error(log_store, BaseLogStore) + return Stack( id=id, name=name, @@ -352,6 +361,7 @@ def _raise_type_error( image_builder=image_builder, model_registry=model_registry, deployer=deployer, + log_store=log_store, ) @property @@ -377,6 +387,7 @@ def components(self) -> Dict[StackComponentType, "StackComponent"]: self.image_builder, self.model_registry, self.deployer, + self.log_store, ] if component is not None } @@ -517,6 +528,15 @@ def deployer(self) -> Optional["BaseDeployer"]: """ return self._deployer + @property + def log_store(self) -> Optional["BaseLogStore"]: + """The log store of the stack. + + Returns: + The log store of the stack. + """ + return self._log_store + def dict(self) -> Dict[str, str]: """Converts the stack into a dictionary. diff --git a/src/zenml/zen_stores/schemas/component_schemas.py b/src/zenml/zen_stores/schemas/component_schemas.py index 8165f2bcc51..e3d48b1bed8 100644 --- a/src/zenml/zen_stores/schemas/component_schemas.py +++ b/src/zenml/zen_stores/schemas/component_schemas.py @@ -97,11 +97,6 @@ class StackComponentSchema(NamedSchema, table=True): }, ) - run_or_step_logs: List["LogsSchema"] = Relationship( - back_populates="artifact_store", - sa_relationship_kwargs={"cascade": "delete", "uselist": True}, - ) - connector_id: Optional[UUID] = build_foreign_key_field( source=__tablename__, target=ServiceConnectorSchema.__tablename__, diff --git a/src/zenml/zen_stores/schemas/logs_schemas.py b/src/zenml/zen_stores/schemas/logs_schemas.py index 1900b0bc488..3632930191c 100644 --- a/src/zenml/zen_stores/schemas/logs_schemas.py +++ b/src/zenml/zen_stores/schemas/logs_schemas.py @@ -45,7 +45,7 @@ class LogsSchema(BaseSchema, table=True): ) # Fields - uri: str = Field(sa_column=Column(TEXT, nullable=False)) + uri: Optional[str] = Field(sa_column=Column(TEXT, nullable=True)) source: str = Field(sa_column=Column(VARCHAR(255), nullable=False)) # Foreign Keys @@ -65,19 +65,24 @@ class LogsSchema(BaseSchema, table=True): ondelete="CASCADE", nullable=True, ) - artifact_store_id: UUID = build_foreign_key_field( + artifact_store_id: Optional[UUID] = build_foreign_key_field( source=__tablename__, target=StackComponentSchema.__tablename__, - source_column="stack_component_id", + source_column="artifact_store_id", target_column="id", ondelete="CASCADE", - nullable=False, + nullable=True, + ) + log_store_id: Optional[UUID] = build_foreign_key_field( + source=__tablename__, + target=StackComponentSchema.__tablename__, + source_column="log_store_id", + target_column="id", + ondelete="CASCADE", + nullable=True, ) # Relationships - 
artifact_store: Optional["StackComponentSchema"] = Relationship( - back_populates="run_or_step_logs" - ) pipeline_run: Optional["PipelineRunSchema"] = Relationship( back_populates="logs" ) @@ -111,6 +116,7 @@ def to_model( step_run_id=self.step_run_id, pipeline_run_id=self.pipeline_run_id, artifact_store_id=self.artifact_store_id, + log_store_id=self.log_store_id, ) return LogsResponse( id=self.id, From f7fe096015610aed1047011ed7ca5c8c396741a2 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 24 Oct 2025 15:17:59 +0200 Subject: [PATCH 02/81] migration --- .../versions/5c0a1c787128_add_log_stores.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py new file mode 100644 index 00000000000..fcbbd89b0f7 --- /dev/null +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -0,0 +1,56 @@ +"""add log stores [5c0a1c787128]. + +Revision ID: 5c0a1c787128 +Revises: 124b57b8c7b1 +Create Date: 2025-10-24 10:06:54.402219 + +""" + +import sqlalchemy as sa +import sqlmodel +from alembic import op + +# revision identifiers, used by Alembic. +# TODO: I WILL HAVE TO CHANGE THIS +revision = "5c0a1c787128" +down_revision = "124b57b8c7b1" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + """Upgrade database schema and/or data, creating a new revision.""" + with op.batch_alter_table("logs", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "log_store_id", sqlmodel.sql.sqltypes.GUID(), nullable=True + ) + ) + batch_op.alter_column("uri", existing_type=sa.TEXT(), nullable=True) + batch_op.alter_column( + "artifact_store_id", + existing_type=sa.CHAR(length=32), + nullable=True, + ) + batch_op.create_foreign_key( + "fk_logs_log_store_id_stack_component", + "stack_component", + ["log_store_id"], + ["id"], + ondelete="CASCADE", + ) + + +def downgrade() -> None: + """Downgrade database schema and/or data back to the previous revision.""" + with op.batch_alter_table("logs", schema=None) as batch_op: + batch_op.drop_constraint( + "fk_logs_log_store_id_stack_component", type_="foreignkey" + ) + batch_op.alter_column( + "artifact_store_id", + existing_type=sa.CHAR(length=32), + nullable=False, + ) + batch_op.alter_column("uri", existing_type=sa.TEXT(), nullable=False) + batch_op.drop_column("log_store_id") From e5ef506abbda19880be723e7a654dfe46b88da1f Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 29 Oct 2025 09:20:33 +0100 Subject: [PATCH 03/81] second check --- src/zenml/log_stores/__init__.py | 4 +- src/zenml/log_stores/base_log_store.py | 102 ++++++++++ src/zenml/logging/step_logging.py | 186 +++++------------- src/zenml/models/v2/core/logs.py | 56 ++++-- src/zenml/orchestrators/step_launcher.py | 39 ++-- src/zenml/orchestrators/step_runner.py | 11 +- src/zenml/pipelines/pipeline_definition.py | 30 +-- .../zen_server/routers/runs_endpoints.py | 12 +- .../zen_server/routers/steps_endpoints.py | 13 +- .../zen_stores/schemas/component_schemas.py | 1 - src/zenml/zen_stores/sql_zen_store.py | 6 + 11 files changed, 248 insertions(+), 212 deletions(-) create mode 100644 src/zenml/log_stores/base_log_store.py diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index a520ebd4803..a59f87e105d 100644 --- a/src/zenml/log_stores/__init__.py +++ 
b/src/zenml/log_stores/__init__.py @@ -17,8 +17,8 @@ """ from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig -from zenml.log_stores.datadog_flavor import DatadogLogStoreFlavor -from zenml.log_stores.otel_flavor import OtelLogStoreFlavor +from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor +from zenml.log_stores.otel.otel_flavor import OtelLogStoreFlavor from zenml.log_stores.utils import fetch_logs __all__ = [ diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py new file mode 100644 index 00000000000..95ce11ed231 --- /dev/null +++ b/src/zenml/log_stores/base_log_store.py @@ -0,0 +1,102 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Base class for log stores.""" + +from abc import abstractmethod +from datetime import datetime +from typing import TYPE_CHECKING, List, Optional, cast + +from zenml.stack import StackComponent, StackComponentConfig + +if TYPE_CHECKING: + from zenml.logging.step_logging import LogEntry + from zenml.models import LogsResponse + + +class BaseLogStoreConfig(StackComponentConfig): + """Base configuration for all log stores.""" + + +class BaseLogStore(StackComponent): + """Base class for all ZenML log stores. + + A log store is responsible for collecting, storing, and retrieving logs + during pipeline and step execution. Different implementations may store + logs in different backends (artifact store, OpenTelemetry, Datadog, etc.). + """ + + @property + def config(self) -> BaseLogStoreConfig: + """Returns the configuration of the log store. + + Returns: + The configuration. + """ + return cast(BaseLogStoreConfig, self._config) + + @abstractmethod + def activate( + self, + source: str = "step", + ) -> None: + """Activate the log store for log collection. + + This method is called when ZenML needs to start collecting and storing + logs during pipeline or step execution. It should set up any necessary + handlers, threads, or connections. + + Args: + pipeline_run_id: The ID of the pipeline run. + step_id: The ID of the step (if collecting step logs). + source: The source of the logs (e.g., "step", "orchestrator"). + """ + + @abstractmethod + def deactivate(self) -> None: + """Deactivate the log store and stop log collection. + + This method is called when ZenML needs to stop collecting logs. + It should clean up handlers, flush any pending logs, and shut down + any background threads or connections. + """ + + @abstractmethod + def fetch( + self, + logs_model: "LogsResponse", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List["LogEntry"]: + """Fetch logs from the log store. + + This method is called from the server to retrieve logs for display + on the dashboard or via API. The implementation should not require + any integration-specific SDKs that aren't available on the server. 
+ + Each log store implementation can extract the information it needs + from logs_model: + - DefaultLogStore: uses logs_model.uri and logs_model.artifact_store_id + - OtelLogStore: uses logs_model.pipeline_run_id, step_run_id, source + - DatadogLogStore: uses logs_model.pipeline_run_id, step_run_id, source + + Args: + logs_model: The logs model containing metadata about the logs. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of log entries to return. + + Returns: + List of log entries matching the query. + """ diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 52c27aeafec..31b835bda65 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -41,7 +41,6 @@ from zenml.client import Client from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, - ENV_ZENML_DISABLE_STEP_NAMES_IN_LOGS, LOGS_MERGE_INTERVAL_SECONDS, LOGS_STORAGE_MAX_QUEUE_SIZE, LOGS_STORAGE_QUEUE_TIMEOUT, @@ -50,16 +49,14 @@ ) from zenml.enums import LoggingLevels from zenml.exceptions import DoesNotExistException +from zenml.log_stores.default_log_store import DefaultLogStore from zenml.logger import ( get_logger, get_storage_log_level, - logging_handlers, - step_names_in_console, ) from zenml.models import ( LogsRequest, LogsResponse, - PipelineRunUpdate, PipelineSnapshotResponse, ) from zenml.utils.io_utils import sanitize_remote_path @@ -729,78 +726,55 @@ def send_merge_event(self) -> None: self.merge_event.set() -class PipelineLogsStorageContext: - """Context manager which collects logs during pipeline run execution.""" +class LoggingContext: + """Context manager which collects logs using a LogStore.""" - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - prepend_step_name: bool = True, - ) -> None: - """Initializes and prepares a storage object. + def __init__(self, source: str = "step") -> None: + """Initialize the logging context. Args: - logs_uri: the URI of the logs file. - artifact_store: Artifact Store from the current pipeline run context. - prepend_step_name: Whether to prepend the step name to the logs. + source: An identifier for the source of the logs (e.g., "step", "orchestrator") """ - # Create the storage object - self.storage = PipelineLogsStorage( - logs_uri=logs_uri, artifact_store=artifact_store - ) + self.source = source - # Create the handler object - self.artifact_store_handler: ArtifactStoreHandler = ( - ArtifactStoreHandler(self.storage) - ) + self.log_request_id = uuid4() - # Additional configuration - self.prepend_step_name = prepend_step_name - self.original_step_names_in_console: Optional[bool] = None - self._original_root_level: Optional[int] = None + try: + self.log_store = Client().active_stack.log_store + except AttributeError: + from zenml.log_stores.default_log_store import DefaultLogStore - def __enter__(self) -> "PipelineLogsStorageContext": - """Enter condition of the context manager. + self.log_store = DefaultLogStore() - Registers an ArtifactStoreHandler for log storage. + def create_log_request(self) -> "LogsRequest": + """Create a log request model. - Returns: - self + In their structure, LogRequest objects do not feature an entity ID or type, they + are rather used within other request model that does the assignment automatically. + That's why everytime we start a context, we need to be able to create the + corresponding LogRequest object. 
""" - # Add handler to root logger - root_logger = logging.getLogger() - root_logger.addHandler(self.artifact_store_handler) - - # Set root logger level to minimum of all active handlers - # This ensures records can reach any handler that needs them - self._original_root_level = root_logger.level - handler_levels = [handler.level for handler in root_logger.handlers] - - # Set root logger to the minimum level among all handlers - min_level = min(handler_levels) - if min_level < root_logger.level: - root_logger.setLevel(min_level) - - # Add handler to context variables for print() capture - logging_handlers.add(self.artifact_store_handler) - - # Save the current step names context variable state - self.original_step_names_in_console = step_names_in_console.get() - - # Set the step names context variable - step_names_disabled = handle_bool_env_var( - ENV_ZENML_DISABLE_STEP_NAMES_IN_LOGS, default=False - ) - - if step_names_disabled or not self.prepend_step_name: - # Step names are disabled through the env or they are disabled in the config - step_names_in_console.set(False) + if isinstance(self.log_store, DefaultLogStore): + return LogsRequest( + id=self.log_request_id, + source=self.source, + uri=self.log_store.uri, + artifact_store_id=self.log_store.artifact_store_id, + ) else: - # Otherwise, set it True (default) - step_names_in_console.set(True) + return LogsRequest( + id=self.log_request_id, + source=self.source, + log_store_id=self.log_store.id, + ) + + def __enter__(self) -> "LoggingContext": + """Enter the context and activate log collection. - redirected.set(True) + Returns: + self + """ + self.log_store.activate(source=self.source) return self def __exit__( @@ -809,52 +783,20 @@ def __exit__( exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: - """Exit condition of the context manager. + """Exit the context and deactivate log collection. Args: - exc_type: The class of the exception - exc_val: The instance of the exception - exc_tb: The traceback of the exception - - Removes the handler from loggers and context variables. + exc_type: The class of the exception. + exc_val: The instance of the exception. + exc_tb: The traceback of the exception. 
""" if exc_type is not None: - # Write the exception and its traceback to the logs - self.artifact_store_handler.emit( - logging.LogRecord( - name="exception", - level=logging.ERROR, - pathname="", - lineno=0, - msg="An exception has occurred.", - args=(), - exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, - ) + logger.error( + "An exception has occurred.", + exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, ) - # Remove handler from root logger and restore original level - root_logger = logging.getLogger() - - # Check if handler is still in the root logger before removing - if self.artifact_store_handler in root_logger.handlers: - root_logger.removeHandler(self.artifact_store_handler) - - # Restore original root logger level - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) - - # Remove handler from context variables - logging_handlers.remove(self.artifact_store_handler) - - # Shutdown thread (it will automatically drain queue and merge files) - try: - self.storage._shutdown_log_storage_thread() - except Exception: - pass - - # Restore the original step names context variable state - if self.original_step_names_in_console is not None: - step_names_in_console.set(self.original_step_names_in_console) + self.log_store.deactivate() def setup_orchestrator_logging( @@ -873,7 +815,7 @@ def setup_orchestrator_logging( logs_response: The logs response to continue from. Returns: - The logs context (PipelineLogsStorageContext) + The logs context """ try: logging_enabled = True @@ -892,39 +834,7 @@ def setup_orchestrator_logging( if not logging_enabled: return nullcontext() - # Fetch the active stack - client = Client() - active_stack = client.active_stack - - if logs_response: - logs_uri = logs_response.uri - else: - logs_uri = prepare_logs_uri( - artifact_store=active_stack.artifact_store, - ) - logs_model = LogsRequest( - uri=logs_uri, - source="orchestrator", - artifact_store_id=active_stack.artifact_store.id, - ) - - # Add orchestrator logs to the pipeline run - try: - run_update = PipelineRunUpdate(add_logs=[logs_model]) - client.zen_store.update_run( - run_id=run_id, run_update=run_update - ) - except Exception as e: - logger.error( - f"Failed to add orchestrator logs to the run {run_id}: {e}" - ) - raise e - - return PipelineLogsStorageContext( - logs_uri=logs_uri, - artifact_store=active_stack.artifact_store, - prepend_step_name=False, - ) + return LoggingContext(source="orchestrator") except Exception as e: logger.error( f"Failed to setup orchestrator logging for run {run_id}: {e}" diff --git a/src/zenml/models/v2/core/logs.py b/src/zenml/models/v2/core/logs.py index ae67c8dc635..de1bff032e1 100644 --- a/src/zenml/models/v2/core/logs.py +++ b/src/zenml/models/v2/core/logs.py @@ -14,7 +14,7 @@ """Models representing logs.""" from typing import Any, Optional -from uuid import UUID +from uuid import UUID, uuid4 from pydantic import Field, field_validator @@ -33,11 +33,23 @@ class LogsRequest(BaseRequest): """Request model for logs.""" - uri: str = Field(title="The uri of the logs file") + id: UUID = Field( + default_factory=uuid4, + title="The unique id.", + ) + uri: Optional[str] = Field( + default=None, + title="The URI of the logs file (for artifact store logs)", + ) # TODO: Remove default value when not supporting clients <0.84.0 anymore source: str = Field(default="", title="The source of the logs file") - artifact_store_id: UUID = Field( - title="The artifact store ID to associate the logs with.", + artifact_store_id: 
Optional[UUID] = Field( + default=None, + title="The artifact store ID (for artifact store logs)", + ) + log_store_id: Optional[UUID] = Field( + default=None, + title="The log store ID that collected these logs", ) @field_validator("uri") @@ -55,10 +67,11 @@ def text_field_max_length_check(cls, value: Any) -> Any: AssertionError: if the length of the field is longer than the maximum threshold. """ - assert len(str(value)) < TEXT_FIELD_MAX_LENGTH, ( - "The length of the value for this field can not " - f"exceed {TEXT_FIELD_MAX_LENGTH}" - ) + if value is not None: + assert len(str(value)) < TEXT_FIELD_MAX_LENGTH, ( + "The length of the value for this field can not " + f"exceed {TEXT_FIELD_MAX_LENGTH}" + ) return value @@ -72,8 +85,9 @@ def text_field_max_length_check(cls, value: Any) -> Any: class LogsResponseBody(BaseDatedResponseBody): """Response body for logs.""" - uri: str = Field( - title="The uri of the logs file", + uri: Optional[str] = Field( + default=None, + title="The URI of the logs file (for artifact store logs)", max_length=TEXT_FIELD_MAX_LENGTH, ) source: str = Field( @@ -95,8 +109,13 @@ class LogsResponseMetadata(BaseResponseMetadata): default=None, description="When this is set, step_run_id should be set to None.", ) - artifact_store_id: UUID = Field( - title="The artifact store ID to associate the logs with.", + artifact_store_id: Optional[UUID] = Field( + default=None, + title="The artifact store ID that collected these logs", + ) + log_store_id: Optional[UUID] = Field( + default=None, + title="The log store ID that collected these logs", ) @@ -123,7 +142,7 @@ def get_hydrated_version(self) -> "LogsResponse": # Body and metadata properties @property - def uri(self) -> str: + def uri(self) -> Optional[str]: """The `uri` property. Returns: @@ -159,7 +178,7 @@ def pipeline_run_id(self) -> Optional[UUID]: return self.get_metadata().pipeline_run_id @property - def artifact_store_id(self) -> UUID: + def artifact_store_id(self) -> Optional[UUID]: """The `artifact_store_id` property. Returns: @@ -167,6 +186,15 @@ def artifact_store_id(self) -> UUID: """ return self.get_metadata().artifact_store_id + @property + def log_store_id(self) -> Optional[UUID]: + """The `log_store_id` property. + + Returns: + the value of the property. 
+ """ + return self.get_metadata().log_store_id + # ------------------ Filter Model ------------------ diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 050f338b9e6..1ec740df8f1 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -33,7 +33,6 @@ from zenml.logger import get_logger from zenml.logging import step_logging from zenml.models import ( - LogsRequest, PipelineRunRequest, PipelineRunResponse, PipelineSnapshotResponse, @@ -270,21 +269,15 @@ def launch(self) -> None: logs_model = None if step_logging_enabled: - # Configure the logs - logs_uri = step_logging.prepare_logs_uri( - artifact_store=self._stack.artifact_store, - step_name=self._invocation_id, - ) + from zenml.enums import LoggableEntityType - logs_context = step_logging.PipelineLogsStorageContext( - logs_uri=logs_uri, artifact_store=self._stack.artifact_store + logs_context = step_logging.LoggingContext( + entity_type=LoggableEntityType.PIPELINE_RUN, + entity_id=pipeline_run.id, + source="execution", ) # type: ignore[assignment] - logs_model = LogsRequest( - uri=logs_uri, - source="execution", - artifact_store_id=self._stack.artifact_store.id, - ) + logs_model = logs_context.create_log_request() with logs_context: if run_was_created: @@ -339,11 +332,25 @@ def launch(self) -> None: # the external jobs in step operators if isinstance( logs_context, - step_logging.PipelineLogsStorageContext, + step_logging.LoggingContext, ): - force_write_logs = ( - logs_context.storage.send_merge_event + # For LoggingContext using DefaultLogStore, trigger merge + from zenml.log_stores.default_log_store import ( + DefaultLogStore, ) + + if isinstance( + logs_context.log_store, DefaultLogStore + ) and hasattr(logs_context.log_store, "storage"): + force_write_logs = ( + logs_context.log_store.storage.send_merge_event + ) + else: + + def _bypass() -> None: + return None + + force_write_logs = _bypass else: def _bypass() -> None: diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index 2f248389ade..058de4a09e7 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -38,11 +38,11 @@ handle_bool_env_var, ) from zenml.deployers.server import runtime -from zenml.enums import ArtifactSaveType +from zenml.enums import ArtifactSaveType, LoggableEntityType from zenml.exceptions import StepInterfaceError from zenml.hooks.hook_validators import load_and_run_hook from zenml.logger import get_logger -from zenml.logging.step_logging import PipelineLogsStorageContext, redirected +from zenml.logging.step_logging import LoggingContext, redirected from zenml.materializers.base_materializer import BaseMaterializer from zenml.materializers.in_memory_materializer import InMemoryMaterializer from zenml.models.v2.core.step_run import ( @@ -151,9 +151,10 @@ def run( logs_context = nullcontext() if step_logging_enabled and not redirected.get(): if step_run.logs: - logs_context = PipelineLogsStorageContext( # type: ignore[assignment] - logs_uri=step_run.logs.uri, - artifact_store=self._stack.artifact_store, + logs_context = LoggingContext( # type: ignore[assignment] + entity_type=LoggableEntityType.STEP_RUN, + entity_id=step_run.id, + source="step", ) else: logger.debug( diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index f388dc5f581..b9ff9e1736f 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ 
b/src/zenml/pipelines/pipeline_definition.py @@ -58,14 +58,10 @@ from zenml.exceptions import EntityExistsError from zenml.hooks.hook_validators import resolve_and_validate_hook from zenml.logger import get_logger -from zenml.logging.step_logging import ( - PipelineLogsStorageContext, - prepare_logs_uri, -) +from zenml.logging.step_logging import LoggingContext from zenml.models import ( CodeReferenceRequest, DeploymentResponse, - LogsRequest, PipelineBuildBase, PipelineBuildResponse, PipelineRequest, @@ -962,32 +958,20 @@ def _run( ) logs_context = nullcontext() - logs_model = None + logs_request = None if logging_enabled: - # Configure the logs - logs_uri = prepare_logs_uri( - stack.artifact_store, - ) - - logs_context = PipelineLogsStorageContext( - logs_uri=logs_uri, - artifact_store=stack.artifact_store, - prepend_step_name=False, - ) # type: ignore[assignment] - - logs_model = LogsRequest( - uri=logs_uri, - source="client", - artifact_store_id=stack.artifact_store.id, - ) + logs_context = LoggingContext() + logs_request = logs_context.generate_logs_request() with logs_context: snapshot = self._create_snapshot(**self._run_args) self.log_pipeline_snapshot_metadata(snapshot) run = ( - create_placeholder_run(snapshot=snapshot, logs=logs_model) + create_placeholder_run( + snapshot=snapshot, logs=logs_request + ) if not snapshot.schedule else None ) diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index 1707d8bd87d..7f50b2adc84 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -30,11 +30,11 @@ VERSION_1, ) from zenml.enums import ExecutionStatus +from zenml.log_stores import fetch_logs from zenml.logger import get_logger from zenml.logging.step_logging import ( MAX_ENTRIES_PER_REQUEST, LogEntry, - fetch_log_records, parse_log_entry, ) from zenml.models import ( @@ -485,12 +485,12 @@ def run_logs( # Handle logs from log collection if run.log_collection: - for log_entry in run.log_collection: - if log_entry.source == source: - return fetch_log_records( + for logs_response in run.log_collection: + if logs_response.source == source: + return fetch_logs( + logs=logs_response, zen_store=store, - artifact_store_id=log_entry.artifact_store_id, - logs_uri=log_entry.uri, + limit=MAX_ENTRIES_PER_REQUEST, ) # If no logs found for the specified source, raise an error diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 772d480348c..0014a8ed653 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -21,15 +21,16 @@ from zenml.constants import ( API, LOGS, + MAX_ENTRIES_PER_REQUEST, STATUS, STEP_CONFIGURATION, STEPS, VERSION_1, ) from zenml.enums import ExecutionStatus +from zenml.log_stores import fetch_logs from zenml.logging.step_logging import ( LogEntry, - fetch_log_records, ) from zenml.models import ( Page, @@ -277,14 +278,12 @@ def get_step_logs( pipeline_run = zen_store().get_run(step.pipeline_run_id) verify_permission_for_model(pipeline_run, action=Action.READ) - store = zen_store() - # Verify that logs are available for this step if step.logs is None: raise KeyError("No logs available for this step.") - return fetch_log_records( - zen_store=store, - artifact_store_id=step.logs.artifact_store_id, - logs_uri=step.logs.uri, + return fetch_logs( + logs=step.logs, + zen_store=zen_store(), + limit=MAX_ENTRIES_PER_REQUEST, ) diff --git 
a/src/zenml/zen_stores/schemas/component_schemas.py b/src/zenml/zen_stores/schemas/component_schemas.py index e3d48b1bed8..8f467c216c7 100644 --- a/src/zenml/zen_stores/schemas/component_schemas.py +++ b/src/zenml/zen_stores/schemas/component_schemas.py @@ -44,7 +44,6 @@ if TYPE_CHECKING: from zenml.zen_stores.schemas.flavor_schemas import FlavorSchema - from zenml.zen_stores.schemas.logs_schemas import LogsSchema from zenml.zen_stores.schemas.run_metadata_schemas import RunMetadataSchema from zenml.zen_stores.schemas.schedule_schema import ScheduleSchema from zenml.zen_stores.schemas.secret_schemas import SecretSchema diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index 3cf0af8b6dd..d9a46ee2648 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -6237,12 +6237,14 @@ def _create_run( ) log_entry = LogsSchema( + id=pipeline_run.logs.id, uri=pipeline_run.logs.uri, # TODO: Remove fallback when not supporting # clients <0.84.0 anymore source=pipeline_run.logs.source or "client", pipeline_run_id=new_run.id, artifact_store_id=pipeline_run.logs.artifact_store_id, + log_store_id=pipeline_run.logs.log_store_id, ) try: session.add(log_entry) @@ -6721,12 +6723,14 @@ def update_run( # Create the log entry log_entry = LogsSchema( + id=log_request.id, uri=log_request.uri, # TODO: Remove fallback when not supporting # clients <0.84.0 anymore source=log_request.source or "orchestrator", pipeline_run_id=existing_run.id, artifact_store_id=log_request.artifact_store_id, + log_store_id=log_request.log_store_id, ) session.add(log_entry) @@ -9758,12 +9762,14 @@ def create_run_step(self, step_run: StepRunRequest) -> StepRunResponse: ) log_entry = LogsSchema( + id=step_run.logs.id, uri=step_run.logs.uri, # TODO: Remove fallback when not supporting # clients <0.84.0 anymore source=step_run.logs.source or "execution", step_run_id=step_schema.id, artifact_store_id=step_run.logs.artifact_store_id, + log_store_id=step_run.logs.log_store_id, ) try: session.add(log_entry) From a2a88b3270cfd77313f54a9f33429b014d6d91a6 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 29 Oct 2025 09:51:19 +0100 Subject: [PATCH 04/81] docs checkpoint --- .../book/component-guide/log-stores/README.md | 90 +++++++ .../book/component-guide/log-stores/custom.md | 238 ++++++++++++++++++ .../component-guide/log-stores/datadog.md | 104 ++++++++ docs/book/component-guide/log-stores/otel.md | 81 ++++++ docs/book/component-guide/toc.md | 4 + 5 files changed, 517 insertions(+) create mode 100644 docs/book/component-guide/log-stores/README.md create mode 100644 docs/book/component-guide/log-stores/custom.md create mode 100644 docs/book/component-guide/log-stores/datadog.md create mode 100644 docs/book/component-guide/log-stores/otel.md diff --git a/docs/book/component-guide/log-stores/README.md b/docs/book/component-guide/log-stores/README.md new file mode 100644 index 00000000000..44c493d333f --- /dev/null +++ b/docs/book/component-guide/log-stores/README.md @@ -0,0 +1,90 @@ +--- +description: Collecting and storing logs from your pipeline runs. +icon: file-lines +--- + +# Log Stores + +The Log Store is a stack component that handles the collection, storage, and retrieval of logs generated during pipeline and step execution. It provides a centralized way to manage logs across different backends. + +ZenML automatically captures logs from your pipeline runs, including stdout, stderr, and any logging output from your steps. 
The Log Store determines where these logs are stored and how they can be retrieved later for debugging and monitoring. + +{% hint style="info" %} +By default, if no Log Store is configured in your stack, ZenML will automatically use the Artifact Store as a fallback location for storing logs. This ensures backward compatibility and that logs are always captured without requiring additional configuration. +{% endhint %} + +### When to use it + +The Log Store is an optional component in the ZenML stack. While ZenML provides a default fallback mechanism (using the Artifact Store), you may want to configure a dedicated Log Store when you need: + +* **Centralized logging infrastructure**: Send logs to your existing logging platform (e.g., Datadog, Elasticsearch, Splunk) +* **Real-time log streaming**: View logs as they are generated during pipeline execution +* **Advanced log analysis**: Use specialized logging platforms for searching, filtering, and analyzing logs +* **Compliance requirements**: Store logs in specific systems for regulatory or audit purposes + +#### Log Store Flavors + +Out of the box, ZenML provides several Log Store implementations: + +| Log Store | Flavor | Integration | Notes | +| ------------------ | ---------- | ----------- | ----------------------------------------------------------------------------- | +| [OpenTelemetry](otel.md) | `otel` | _built-in_ | Generic OpenTelemetry-based log store that can export to various backends | +| [Datadog](datadog.md) | `datadog` | _built-in_ | Sends logs directly to Datadog's logging platform | +| [Custom Implementation](custom.md) | _custom_ | | Extend the Log Store abstraction and provide your own implementation | + +If you would like to see the available flavors of Log Stores, you can use the command: + +```shell +zenml log-store flavor list +``` + +### How to use it + +The Log Store works automatically once configured in your stack. You don't need to make any changes to your pipeline code. All logging output, print statements, and errors are automatically captured and sent to the configured Log Store. + +#### Basic Setup + +To register and configure a Log Store: + +```shell +# Register a log store +zenml log-store register my_datadog_logs --flavor datadog \ + --api_key= \ + --site=datadoghq.com + +# Add it to your stack +zenml stack update -l my_datadog_logs +``` + +Once configured, all subsequent pipeline runs will send their logs to the configured Log Store. + +#### Viewing Logs + +Logs can be viewed through: + +1. **ZenML Dashboard**: View logs directly in the pipeline run UI +2. **CLI**: Use `zenml logs` commands to fetch and display logs +3. **External Platform**: Access logs directly in your logging platform (e.g., Datadog UI) + +#### Log Metadata + +All logs captured by ZenML include important metadata: + +* `pipeline_run_id`: The unique identifier of the pipeline run +* `step_id`: The unique identifier of the step (if applicable) +* `source`: Where the logs originated from (e.g., "step", "orchestrator") + +This metadata allows you to filter and query logs effectively in your logging platform. + +#### Fallback Behavior + +If no Log Store is configured in your stack, ZenML will: + +1. Automatically use the Artifact Store as the storage backend +2. Save logs as files in the artifact store +3. Make logs accessible through the same APIs and UI + +This ensures that logs are always captured and retrievable, even without explicit Log Store configuration. + +
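Since log capture hooks into the stack rather than into user code, a pipeline does not change at all when a Log Store is added. The sketch below is a minimal, hypothetical example (step and pipeline names are illustrative): ordinary `logging` calls and `print` statements inside a step are exactly what ends up in the configured backend.

```python
from zenml import pipeline, step
from zenml.logger import get_logger

logger = get_logger(__name__)


@step
def train() -> None:
    # Both logger output and plain prints are captured and routed to the
    # stack's Log Store (or to the Artifact Store fallback).
    logger.info("Starting training ...")
    print("epoch 0: loss=0.42")


@pipeline
def training_pipeline() -> None:
    train()


if __name__ == "__main__":
    # Nothing log-store-specific here: the active stack decides where the
    # captured output is stored.
    training_pipeline()
```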
+ diff --git a/docs/book/component-guide/log-stores/custom.md b/docs/book/component-guide/log-stores/custom.md new file mode 100644 index 00000000000..ba628722c81 --- /dev/null +++ b/docs/book/component-guide/log-stores/custom.md @@ -0,0 +1,238 @@ +--- +description: Developing a custom log store. +--- + +# Develop a Custom Log Store + +If you want to send logs to a backend that isn't covered by the built-in log stores, you can create your own custom log store implementation. + +### Base Abstraction + +The `BaseLogStore` provides three main methods that you need to implement: + +```python +from zenml.log_stores import BaseLogStore, BaseLogStoreConfig + +class MyLogStoreConfig(BaseLogStoreConfig): + """Configuration for my custom log store.""" + + my_setting: str + another_setting: int = 100 + +class MyLogStore(BaseLogStore): + """My custom log store implementation.""" + + @property + def config(self) -> MyLogStoreConfig: + return cast(MyLogStoreConfig, self._config) + + def activate( + self, + pipeline_run_id: UUID, + step_id: Optional[UUID] = None, + source: str = "step", + ) -> None: + """Activate log collection. + + This is called at the start of a pipeline run or step. + Set up your logging handlers, connections, and any + background processing here. + """ + pass + + def deactivate(self) -> None: + """Deactivate log collection and clean up. + + This is called at the end of a pipeline run or step. + Flush any pending logs, close connections, and clean + up resources here. + """ + pass + + def fetch( + self, + pipeline_run_id: UUID, + step_id: Optional[UUID] = None, + source: Optional[str] = None, + logs_uri: Optional[str] = None, + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List[LogEntry]: + """Fetch logs from the backend. + + This is called by the server to retrieve logs for display. + Query your backend and return logs as LogEntry objects. + """ + return [] +``` + +### Implementation Patterns + +#### 1. Using Python Logging Handlers + +The most common pattern is to create a `logging.Handler` that sends logs to your backend: + +```python +import logging +from zenml.log_stores import BaseLogStore +from zenml.logger import logging_handlers, get_storage_log_level + +class MyLogStore(BaseLogStore): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.handler = None + self._original_root_level = None + + def activate(self, pipeline_run_id, step_id=None, source="step"): + self.handler = MyCustomHandler( + backend_url=self.config.backend_url, + pipeline_run_id=pipeline_run_id, + step_id=step_id, + ) + + self.handler.setLevel(get_storage_log_level().value) + + root_logger = logging.getLogger() + root_logger.addHandler(self.handler) + + self._original_root_level = root_logger.level + handler_levels = [h.level for h in root_logger.handlers] + root_logger.setLevel(min(handler_levels)) + + logging_handlers.add(self.handler) + + def deactivate(self): + if not self.handler: + return + + root_logger = logging.getLogger() + if self.handler in root_logger.handlers: + root_logger.removeHandler(self.handler) + + if self._original_root_level is not None: + root_logger.setLevel(self._original_root_level) + + logging_handlers.remove(self.handler) +``` + +#### 2. 
Background Processing + +For efficient log handling, use background threads or async processing: + +```python +import queue +import threading + +class MyLogStore(BaseLogStore): + def activate(self, pipeline_run_id, step_id=None, source="step"): + self.log_queue = queue.Queue(maxsize=2048) + self.shutdown_event = threading.Event() + self.worker_thread = threading.Thread( + target=self._process_logs, + daemon=True + ) + self.worker_thread.start() + + def _process_logs(self): + while not self.shutdown_event.is_set(): + try: + log_entry = self.log_queue.get(timeout=1) + self._send_to_backend(log_entry) + except queue.Empty: + continue + + def deactivate(self): + self.shutdown_event.set() + if self.worker_thread: + self.worker_thread.join(timeout=5) +``` + +#### 3. Fetching Logs + +Implement fetch using HTTP APIs or SDKs: + +```python +from zenml.logging.step_logging import LogEntry + +class MyLogStore(BaseLogStore): + def fetch( + self, + pipeline_run_id, + step_id=None, + source=None, + logs_uri=None, + start_time=None, + end_time=None, + limit=20000, + ): + query = { + "pipeline_run_id": str(pipeline_run_id), + } + if step_id: + query["step_id"] = str(step_id) + if start_time: + query["start_time"] = start_time.isoformat() + if end_time: + query["end_time"] = end_time.isoformat() + + response = requests.post( + f"{self.config.backend_url}/query", + json=query, + headers={"Authorization": f"Bearer {self.config.api_key}"} + ) + + logs = [] + for log_data in response.json()["logs"][:limit]: + logs.append(LogEntry( + message=log_data["message"], + level=log_data.get("level"), + timestamp=log_data.get("timestamp"), + )) + return logs +``` + +### Creating a Flavor + +To make your log store usable via CLI, create a flavor: + +```python +from zenml.enums import StackComponentType +from zenml.stack.flavor import Flavor + +class MyLogStoreFlavor(Flavor): + @property + def name(self) -> str: + return "my_custom_store" + + @property + def type(self) -> StackComponentType: + return StackComponentType.LOG_STORE + + @property + def config_class(self) -> Type[BaseLogStoreConfig]: + from my_module import MyLogStoreConfig + return MyLogStoreConfig + + @property + def implementation_class(self) -> Type[BaseLogStore]: + from my_module import MyLogStore + return MyLogStore +``` + +Register your flavor: + +```bash +zenml log-store flavor register my_module.MyLogStoreFlavor +``` + +Then use it: + +```bash +zenml log-store register my_logs --flavor my_custom_store \ + --backend_url=https://logs.example.com \ + --api_key=secret +``` + +
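To see how a custom store is exercised at runtime, it helps to look at the `LoggingContext` introduced earlier in this patch: ZenML wraps step (and orchestrator) execution in that context, calling `activate()` on entry and `deactivate()` on exit, so everything emitted in between flows through whatever handler the store registered. A rough sketch, assuming the active stack has the custom store configured:

```python
import logging

from zenml.logging.step_logging import LoggingContext

# Entering the context calls activate(source="step") on the active stack's
# log store; leaving it calls deactivate(), which should flush any queued
# records to the backend.
with LoggingContext(source="step"):
    logging.getLogger(__name__).info("captured by the custom log store")
    print("prints are captured too, via the handler registered in activate()")
```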
+ diff --git a/docs/book/component-guide/log-stores/datadog.md b/docs/book/component-guide/log-stores/datadog.md new file mode 100644 index 00000000000..676094addbc --- /dev/null +++ b/docs/book/component-guide/log-stores/datadog.md @@ -0,0 +1,104 @@ +--- +description: Sending logs directly to Datadog. +--- + +# Datadog Log Store + +The Datadog Log Store sends logs directly to Datadog's logging platform using their HTTP intake API. It extends the OpenTelemetry Log Store and adds Datadog-specific formatting and tagging. + +### When to use it + +Use the Datadog Log Store when you: + +* Already use Datadog for monitoring and observability +* Want to correlate pipeline logs with other Datadog metrics and traces +* Need advanced log analysis and visualization features +* Want to set up alerts based on pipeline log patterns + +### How to deploy it + +The Datadog Log Store requires the OpenTelemetry SDK to be installed: + +```bash +pip install opentelemetry-sdk requests +``` + +### How to use it + +First, obtain your Datadog API key from the Datadog UI (Organization Settings → API Keys). + +Register a Datadog log store: + +```bash +zenml log-store register datadog_logs --flavor datadog \ + --api_key= \ + --site=datadoghq.com +``` + +For EU customers, use `datadoghq.eu`: + +```bash +zenml log-store register datadog_logs --flavor datadog \ + --api_key= \ + --site=datadoghq.eu +``` + +Add it to your stack: + +```bash +zenml stack update -l datadog_logs +``` + +#### Configuration Options + +The Datadog Log Store supports all OpenTelemetry Log Store options plus: + +* `api_key`: Your Datadog API key (required) +* `site`: The Datadog site (default: "datadoghq.com") + * US: `datadoghq.com` + * EU: `datadoghq.eu` + * Other regions: check Datadog documentation +* `additional_tags`: Additional tags to add to all logs (optional) + +#### Log Tags + +All logs sent to Datadog include the following tags for easy filtering: + +* `service:`: The service name from your configuration +* `zenml.pipeline_run_id:`: The pipeline run identifier +* `zenml.step_id:`: The step identifier (if applicable) +* `zenml.source:`: The log source ("step" or "orchestrator") +* `deployment.environment:`: The deployment environment + +#### Viewing Logs in Datadog + +Once configured, logs will appear in the Datadog Logs Explorer. You can: + +1. Go to Datadog → Logs → Search +2. Filter by service: `service:zenml-pipelines` +3. Filter by pipeline: `zenml.pipeline_run_id:` +4. Filter by step: `zenml.step_id:` + +#### Example: Production Setup + +```bash +zenml log-store register prod_datadog_logs --flavor datadog \ + --api_key=$DATADOG_API_KEY \ + --site=datadoghq.com \ + --service_name=ml-pipelines \ + --deployment_environment=production \ + --additional_tags='{"team":"ml-platform","project":"recommendation-system"}' +``` + +#### Setting Up Alerts + +In Datadog, you can create log-based alerts: + +1. Go to Datadog → Logs → Configuration → Log Alerts +2. Create a new monitor +3. Set the query to filter your pipeline logs (e.g., `service:zenml-pipelines @zenml.pipeline_run_id:*`) +4. Define alert conditions (e.g., error rate threshold) +5. Configure notifications + +
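Before wiring the store into a stack, it can be worth checking the API key and site against the same HTTP intake endpoint that the exporter uses (see the `DatadogLogExporter` added later in this patch series). A rough, self-contained check with placeholder values:

```python
import os

import requests

site = "datadoghq.com"  # or datadoghq.eu, matching the `site` option above
response = requests.post(
    f"https://http-intake.logs.{site}/v1/input",
    headers={
        "DD-API-KEY": os.environ["DATADOG_API_KEY"],
        "Content-Type": "application/json",
    },
    json=[
        {
            "message": "zenml datadog log store connectivity check",
            "ddsource": "zenml",
            "service": "zenml-pipelines",
            "status": "info",
            "ddtags": "zenml.pipeline_run_id:test,zenml.source:step",
        }
    ],
    timeout=10,
)
# Datadog answers 200/202 when the payload is accepted.
print(response.status_code)
```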
+ diff --git a/docs/book/component-guide/log-stores/otel.md b/docs/book/component-guide/log-stores/otel.md new file mode 100644 index 00000000000..58055146c54 --- /dev/null +++ b/docs/book/component-guide/log-stores/otel.md @@ -0,0 +1,81 @@ +--- +description: Exporting logs using OpenTelemetry. +--- + +# OpenTelemetry Log Store + +The OpenTelemetry Log Store allows you to export logs to any OpenTelemetry-compatible backend. It uses the OpenTelemetry SDK to collect and export logs with structured metadata. + +### When to use it + +Use the OpenTelemetry Log Store when you: + +* Want to send logs to any OpenTelemetry-compatible backend +* Need structured logging with rich metadata +* Want to integrate with your existing OpenTelemetry infrastructure +* Need flexibility to change backends without changing your ZenML configuration + +### How to deploy it + +The OpenTelemetry Log Store requires the OpenTelemetry SDK to be installed: + +```bash +pip install opentelemetry-sdk opentelemetry-exporter-otlp +``` + +### How to use it + +Register an OpenTelemetry log store: + +```bash +zenml log-store register otel_logs --flavor otel \ + --service_name=zenml-pipelines \ + --endpoint=http://otel-collector:4318 +``` + +Add it to your stack: + +```bash +zenml stack update -l otel_logs +``` + +#### Configuration Options + +The OpenTelemetry Log Store supports the following configuration options: + +* `service_name`: The name of your service (default: "zenml") +* `service_version`: The version of your service (default: "1.0.0") +* `deployment_environment`: The deployment environment (default: "production") +* `endpoint`: The OTLP endpoint URL (optional) +* `headers`: Custom headers to send with log exports (optional) +* `insecure`: Whether to use an insecure connection (default: False) +* `max_queue_size`: Maximum queue size for batch processing (default: 2048) +* `schedule_delay_millis`: Export interval in milliseconds (default: 1000) +* `max_export_batch_size`: Maximum batch size for exports (default: 512) + +#### Resource Attributes + +All logs exported through the OpenTelemetry Log Store include the following resource attributes: + +* `service.name`: The configured service name +* `service.version`: The configured service version +* `service.instance.id`: A unique instance identifier +* `deployment.environment`: The deployment environment +* `zenml.pipeline_run_id`: The pipeline run UUID +* `zenml.step_id`: The step UUID (if applicable) +* `zenml.source`: The log source ("step" or "orchestrator") + +These attributes allow you to filter and aggregate logs by pipeline, step, or environment in your observability platform. + +#### Example: Using with an OTLP Collector + +```bash +zenml log-store register my_otel_logs --flavor otel \ + --service_name=my-ml-pipelines \ + --deployment_environment=production \ + --endpoint=https://otlp-collector.example.com:4318 \ + --headers='{"Authorization":"Bearer token123"}' +``` + +
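For reference, the wiring below is roughly what an OTLP-based log store sets up internally with the (still experimental) OpenTelemetry logs SDK. Endpoint and attribute values are placeholders; the flavor derives them from the configuration options listed above.

```python
import logging

from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource

provider = LoggerProvider(
    resource=Resource.create(
        {
            "service.name": "zenml-pipelines",
            "deployment.environment": "production",
            "zenml.pipeline_run_id": "<run-uuid>",
            "zenml.source": "step",
        }
    )
)
provider.add_log_record_processor(
    BatchLogRecordProcessor(
        OTLPLogExporter(endpoint="http://otel-collector:4318/v1/logs")
    )
)

# Attaching a LoggingHandler routes standard-library log records through the
# OTLP exporter together with the resource attributes above.
logging.getLogger().addHandler(
    LoggingHandler(level=logging.INFO, logger_provider=provider)
)
logging.getLogger(__name__).info("hello from a ZenML step")
```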
+ diff --git a/docs/book/component-guide/toc.md b/docs/book/component-guide/toc.md index 03359348d01..2376a761cbb 100644 --- a/docs/book/component-guide/toc.md +++ b/docs/book/component-guide/toc.md @@ -30,6 +30,10 @@ * [Google Cloud Storage (GCS)](artifact-stores/gcp.md) * [Azure Blob Storage](artifact-stores/azure.md) * [Develop a custom artifact store](artifact-stores/custom.md) +* [Log Stores](log-stores/README.md) + * [OpenTelemetry](log-stores/otel.md) + * [Datadog](log-stores/datadog.md) + * [Develop a custom log store](log-stores/custom.md) * [Container Registries](container-registries/README.md) * [Default Container Registry](container-registries/default.md) * [DockerHub](container-registries/dockerhub.md) From d7faedf2ddcdc77e1bc54b50b27bdd67fa174fad Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 29 Oct 2025 09:59:04 +0100 Subject: [PATCH 05/81] another checkpoint --- src/zenml/log_stores/datadog/__init__.py | 27 ++ .../log_stores/datadog/datadog_flavor.py | 96 +++++ .../log_stores/datadog/datadog_log_store.py | 359 ++++++++++++++++++ src/zenml/log_stores/default_log_store.py | 216 +++++++++++ src/zenml/log_stores/otel/__init__.py | 27 ++ src/zenml/log_stores/otel/otel_flavor.py | 91 +++++ src/zenml/log_stores/otel/otel_log_store.py | 283 ++++++++++++++ src/zenml/log_stores/utils.py | 85 +++++ src/zenml/logging/step_logging.py | 6 +- src/zenml/orchestrators/step_launcher.py | 4 +- src/zenml/orchestrators/step_runner.py | 6 +- src/zenml/pipelines/pipeline_definition.py | 2 +- 12 files changed, 1190 insertions(+), 12 deletions(-) create mode 100644 src/zenml/log_stores/datadog/__init__.py create mode 100644 src/zenml/log_stores/datadog/datadog_flavor.py create mode 100644 src/zenml/log_stores/datadog/datadog_log_store.py create mode 100644 src/zenml/log_stores/default_log_store.py create mode 100644 src/zenml/log_stores/otel/__init__.py create mode 100644 src/zenml/log_stores/otel/otel_flavor.py create mode 100644 src/zenml/log_stores/otel/otel_log_store.py create mode 100644 src/zenml/log_stores/utils.py diff --git a/src/zenml/log_stores/datadog/__init__.py b/src/zenml/log_stores/datadog/__init__.py new file mode 100644 index 00000000000..01a29e3e61e --- /dev/null +++ b/src/zenml/log_stores/datadog/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Datadog log store implementation.""" + +from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor +from zenml.log_stores.datadog.datadog_log_store import ( + DatadogLogStore, + DatadogLogStoreConfig, +) + +__all__ = [ + "DatadogLogStore", + "DatadogLogStoreConfig", + "DatadogLogStoreFlavor", +] + diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py new file mode 100644 index 00000000000..082907ecc65 --- /dev/null +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -0,0 +1,96 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Datadog log store flavor.""" + +from typing import TYPE_CHECKING, Type + +from zenml.enums import StackComponentType +from zenml.log_stores import BaseLogStore, BaseLogStoreConfig +from zenml.stack.flavor import Flavor + +if TYPE_CHECKING: + pass + + +class DatadogLogStoreFlavor(Flavor): + """Datadog log store flavor.""" + + @property + def name(self) -> str: + """Name of the flavor. + + Returns: + The name of the flavor. + """ + return "datadog" + + @property + def docs_url(self) -> str: + """URL to the flavor documentation. + + Returns: + The URL to the flavor documentation. + """ + return "https://docs.zenml.io/stack-components/log-stores/datadog" + + @property + def sdk_docs_url(self) -> str: + """URL to the SDK docs for this flavor. + + Returns: + The URL to the SDK docs for this flavor. + """ + return self.docs_url + + @property + def logo_url(self) -> str: + """URL to the flavor logo. + + Returns: + The URL to the flavor logo. + """ + return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/datadog.png" + + @property + def type(self) -> StackComponentType: + """Stack component type. + + Returns: + The stack component type. + """ + return StackComponentType.LOG_STORE + + @property + def config_class(self) -> Type[BaseLogStoreConfig]: + """Returns `DatadogLogStoreConfig` config class. + + Returns: + The config class. + """ + from zenml.log_stores.datadog.datadog_log_store import ( + DatadogLogStoreConfig, + ) + + return DatadogLogStoreConfig + + @property + def implementation_class(self) -> Type[BaseLogStore]: + """Implementation class for this flavor. + + Returns: + The implementation class. + """ + from zenml.log_stores.datadog.datadog_log_store import DatadogLogStore + + return DatadogLogStore diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py new file mode 100644 index 00000000000..a64e5ff1d29 --- /dev/null +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -0,0 +1,359 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. 
+"""Datadog log store implementation.""" + +from datetime import datetime +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast + +from pydantic import Field, SecretStr + +from zenml.log_stores.otel.otel_log_store import ( + OtelLogStore, + OtelLogStoreConfig, +) +from zenml.logger import get_logger + +if TYPE_CHECKING: + from opentelemetry.sdk._logs.export import LogExporter + + from zenml.logging.step_logging import LogEntry + from zenml.models import LogsResponse + +logger = get_logger(__name__) + + +class DatadogLogStoreConfig(OtelLogStoreConfig): + """Configuration for Datadog log store. + + This extends OtelLogStoreConfig with Datadog-specific settings. + + Attributes: + api_key: Datadog API key for log ingestion. + site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). + additional_tags: Additional tags to add to all logs. + """ + + api_key: SecretStr = Field( + description="Datadog API key for log ingestion", + ) + site: str = Field( + default="datadoghq.com", + description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", + ) + additional_tags: Dict[str, str] = Field( + default_factory=dict, + description="Additional tags to add to all logs", + ) + + +class DatadogLogExporter: + """Custom log exporter that sends logs to Datadog's HTTP intake API. + + This exporter transforms OpenTelemetry log records into Datadog's format + and sends them via HTTP POST without requiring the Datadog SDK. + """ + + def __init__( + self, + api_key: str, + site: str = "datadoghq.com", + additional_tags: Optional[Dict[str, str]] = None, + ): + """Initialize the Datadog log exporter. + + Args: + api_key: Datadog API key. + site: Datadog site domain. + additional_tags: Additional tags to add to all logs. + """ + self.api_key = api_key + self.endpoint = f"https://http-intake.logs.{site}/v1/input" + self.headers = { + "DD-API-KEY": api_key, + "Content-Type": "application/json", + } + self.additional_tags = additional_tags or {} + + def export(self, batch: List[Any]) -> Any: + """Export a batch of log records to Datadog. + + Args: + batch: List of LogData objects from OpenTelemetry. + + Returns: + LogExportResult indicating success or failure. + """ + try: + import requests + from opentelemetry.sdk._logs.export import LogExportResult + except ImportError: + logger.error( + "Required packages not installed. 
Install with: " + "pip install requests opentelemetry-sdk" + ) + from opentelemetry.sdk._logs.export import LogExportResult + + return LogExportResult.FAILURE + + if not batch: + return LogExportResult.SUCCESS + + logs = [] + for log_data in batch: + log_record = log_data.log_record + + # Extract resource attributes + resource_attrs = {} + if log_record.resource: + resource_attrs = dict(log_record.resource.attributes) + + # Extract log attributes + log_attrs = {} + if log_record.attributes: + log_attrs = dict(log_record.attributes) + + # Combine attributes with additional tags + all_attrs = {**resource_attrs, **log_attrs, **self.additional_tags} + + # Build Datadog log entry + log_entry = { + "message": str(log_record.body), + "ddsource": "zenml", + "service": resource_attrs.get("service.name", "zenml"), + "hostname": resource_attrs.get( + "service.instance.id", "unknown" + ), + } + + # Add severity if available + if log_record.severity_text: + log_entry["status"] = log_record.severity_text.lower() + + # Add timestamp if available (convert from nanoseconds to milliseconds) + if log_record.timestamp: + log_entry["timestamp"] = int(log_record.timestamp / 1_000_000) + + # Add all attributes as tags + if all_attrs: + # Convert dict to Datadog tags format: key:value + tags = [f"{k}:{v}" for k, v in all_attrs.items()] + log_entry["ddtags"] = ",".join(tags) + + logs.append(log_entry) + + try: + response = requests.post( + self.endpoint, + headers=self.headers, + json=logs, + timeout=10, + ) + + if response.status_code in [200, 202]: + logger.debug(f"Successfully sent {len(logs)} logs to Datadog") + return LogExportResult.SUCCESS + else: + logger.warning( + f"Datadog rejected logs: {response.status_code} - {response.text[:200]}" + ) + return LogExportResult.FAILURE + except Exception as e: + logger.error(f"Failed to export logs to Datadog: {e}") + return LogExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered logs. + + Args: + timeout_millis: Timeout in milliseconds. + + Returns: + True if successful. + """ + return True + + +class DatadogLogStore(OtelLogStore): + """Log store that exports logs to Datadog. + + This implementation extends OtelLogStore and configures it to send logs + to Datadog's HTTP intake API. Logs are sent with appropriate tags including + pipeline_run_id, step_id, and source for easy filtering on Datadog. + """ + + @property + def config(self) -> DatadogLogStoreConfig: + """Returns the configuration of the Datadog log store. + + Returns: + The configuration. + """ + return cast(DatadogLogStoreConfig, self._config) + + def get_exporter(self) -> "LogExporter": + """Get the Datadog log exporter. + + Returns: + DatadogLogExporter configured with API key and site. + """ + return DatadogLogExporter( + api_key=self.config.api_key.get_secret_value(), + site=self.config.site, + additional_tags=self.config.additional_tags, + ) + + def fetch( + self, + logs_model: "LogsResponse", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List["LogEntry"]: + """Fetch logs from Datadog's API. + + This method queries Datadog's Logs API to retrieve logs for the + specified pipeline run and step. It uses the HTTP API without + requiring the Datadog SDK. + + Args: + logs_model: The logs model containing run and step metadata. + start_time: Filter logs after this time. + end_time: Filter logs before this time. 
+ limit: Maximum number of log entries to return. + + Returns: + List of log entries from Datadog. + """ + try: + import requests + except ImportError: + logger.error( + "requests package not installed. Install with: pip install requests" + ) + return [] + + from zenml.logging.step_logging import LogEntry + + # Build query + query_parts = [ + f"service:{self.config.service_name}", + f"zenml.pipeline_run_id:{logs_model.pipeline_run_id}", + ] + + if logs_model.step_run_id: + query_parts.append(f"zenml.step_id:{logs_model.step_run_id}") + + if logs_model.source: + query_parts.append(f"zenml.source:{logs_model.source}") + + query = " ".join(query_parts) + + # Build API request + api_endpoint = ( + f"https://api.{self.config.site}/api/v2/logs/events/search" + ) + headers = { + "DD-API-KEY": self.config.api_key.get_secret_value(), + "Content-Type": "application/json", + } + + body: Dict[str, Any] = { + "filter": { + "query": query, + }, + "page": { + "limit": min(limit, 1000), # Datadog API limit + }, + "sort": "timestamp", + } + + # Add time filters if provided + if start_time: + body["filter"]["from"] = start_time.isoformat() + if end_time: + body["filter"]["to"] = end_time.isoformat() + + try: + response = requests.post( + api_endpoint, + headers=headers, + json=body, + timeout=30, + ) + + if response.status_code != 200: + logger.error( + f"Failed to fetch logs from Datadog: {response.status_code} - {response.text[:200]}" + ) + return [] + + data = response.json() + log_entries = [] + + for log in data.get("data", []): + attributes = log.get("attributes", {}) + + # Parse log entry + entry = LogEntry( + message=attributes.get("message", ""), + level=self._parse_log_level(attributes.get("status")), + timestamp=datetime.fromisoformat( + attributes["timestamp"].replace("Z", "+00:00") + ) + if "timestamp" in attributes + else None, + ) + + log_entries.append(entry) + + logger.debug(f"Fetched {len(log_entries)} logs from Datadog") + return log_entries + + except Exception as e: + logger.error(f"Error fetching logs from Datadog: {e}") + return [] + + def _parse_log_level( + self, status: Optional[str] + ) -> Optional["LoggingLevels"]: + """Parse Datadog log status to ZenML log level. + + Args: + status: Datadog log status string. + + Returns: + ZenML LoggingLevels enum value. + """ + from zenml.enums import LoggingLevels + + if not status: + return None + + status_upper = status.upper() + if status_upper in ["DEBUG", "TRACE"]: + return LoggingLevels.DEBUG + elif status_upper in ["INFO", "INFORMATION"]: + return LoggingLevels.INFO + elif status_upper in ["WARN", "WARNING"]: + return LoggingLevels.WARN + elif status_upper == "ERROR": + return LoggingLevels.ERROR + elif status_upper in ["CRITICAL", "FATAL", "EMERGENCY"]: + return LoggingLevels.CRITICAL + else: + return LoggingLevels.INFO diff --git a/src/zenml/log_stores/default_log_store.py b/src/zenml/log_stores/default_log_store.py new file mode 100644 index 00000000000..30e774bf1e0 --- /dev/null +++ b/src/zenml/log_stores/default_log_store.py @@ -0,0 +1,216 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Default log store implementation.""" + +import logging +from datetime import datetime +from typing import TYPE_CHECKING, Any, List, Optional, cast +from uuid import UUID + +from zenml.client import Client +from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig +from zenml.logger import get_logger, logging_handlers + +if TYPE_CHECKING: + from zenml.artifact_stores import BaseArtifactStore + from zenml.logging.step_logging import ( + ArtifactStoreHandler, + LogEntry, + PipelineLogsStorage, + ) + from zenml.models import LogsResponse + +logger = get_logger(__name__) + + +class DefaultLogStoreConfig(BaseLogStoreConfig): + """Configuration for the default log store. + + This log store saves logs to the artifact store, which is the default + and backward-compatible approach. + """ + + +class DefaultLogStore(BaseLogStore): + """Log store that saves logs to the artifact store. + + This implementation uses the artifact store as the backend for log storage, + maintaining backward compatibility with existing ZenML behavior. Logs are + written to the artifact store using a background thread and queue for + efficient batching. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize the default log store. + + Args: + *args: Positional arguments for the base class. + **kwargs: Keyword arguments for the base class. + """ + super().__init__(*args, **kwargs) + self.storage: Optional["PipelineLogsStorage"] = None + self.handler: Optional["ArtifactStoreHandler"] = None + self._artifact_store: Optional["BaseArtifactStore"] = None + self._original_root_level: Optional[int] = None + + @property + def config(self) -> DefaultLogStoreConfig: + """Returns the configuration of the default log store. + + Returns: + The configuration. + """ + return cast(DefaultLogStoreConfig, self._config) + + def activate( + self, + pipeline_run_id: UUID, + step_id: Optional[UUID] = None, + source: str = "step", + ) -> None: + """Activate log collection to the artifact store. + + Args: + pipeline_run_id: The ID of the pipeline run. + step_id: The ID of the step (if collecting step logs). + source: The source of the logs (e.g., "step", "orchestrator"). 
+ """ + from zenml.logging.step_logging import ( + ArtifactStoreHandler, + PipelineLogsStorage, + prepare_logs_uri, + ) + + # Get the artifact store from the active stack + client = Client() + self._artifact_store = client.active_stack.artifact_store + + # Prepare logs URI + step_name = None + if step_id: + try: + step_run = client.get_pipeline_run_step(step_id) + step_name = step_run.name + except Exception: + pass + + logs_uri = prepare_logs_uri( + artifact_store=self._artifact_store, + step_name=step_name, + ) + + # Create storage and handler + self.storage = PipelineLogsStorage( + logs_uri=logs_uri, + artifact_store=self._artifact_store, + ) + self.handler = ArtifactStoreHandler(self.storage) + + # Add handler to root logger + root_logger = logging.getLogger() + root_logger.addHandler(self.handler) + + # Set root logger level to minimum of all handlers + self._original_root_level = root_logger.level + handler_levels = [handler.level for handler in root_logger.handlers] + min_level = min(handler_levels) + if min_level < root_logger.level: + root_logger.setLevel(min_level) + + # Add to context variables for print capture + logging_handlers.add(self.handler) + + logger.debug( + f"DefaultLogStore activated for {source} " + f"(pipeline_run={pipeline_run_id}, step={step_id})" + ) + + def deactivate(self) -> None: + """Deactivate log collection and flush remaining logs.""" + if not self.handler: + return + + # Remove handler from root logger + root_logger = logging.getLogger() + if self.handler in root_logger.handlers: + root_logger.removeHandler(self.handler) + + # Restore original root logger level + if self._original_root_level is not None: + root_logger.setLevel(self._original_root_level) + + # Remove from context variables + logging_handlers.remove(self.handler) + + # Shutdown storage thread (flushes and merges logs) + if self.storage: + try: + self.storage._shutdown_log_storage_thread() + except Exception as e: + logger.warning(f"Error shutting down log storage: {e}") + + logger.debug("DefaultLogStore deactivated") + + def fetch( + self, + logs_model: "LogsResponse", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List["LogEntry"]: + """Fetch logs from the artifact store. + + Args: + logs_model: The logs model containing uri and artifact_store_id. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of log entries to return. + + Returns: + List of log entries from the artifact store. + + Raises: + ValueError: If logs_model.uri is not provided. 
+ """ + from zenml.logging.step_logging import fetch_log_records + + if not logs_model.uri: + raise ValueError( + "logs_model.uri is required for DefaultLogStore.fetch()" + ) + + if not logs_model.artifact_store_id: + raise ValueError( + "logs_model.artifact_store_id is required for DefaultLogStore.fetch()" + ) + + client = Client() + log_entries = fetch_log_records( + zen_store=client.zen_store, + artifact_store_id=logs_model.artifact_store_id, + logs_uri=logs_model.uri, + ) + + if start_time or end_time: + filtered_entries = [] + for entry in log_entries: + if entry.timestamp: + if start_time and entry.timestamp < start_time: + continue + if end_time and entry.timestamp > end_time: + continue + filtered_entries.append(entry) + log_entries = filtered_entries + + return log_entries[:limit] diff --git a/src/zenml/log_stores/otel/__init__.py b/src/zenml/log_stores/otel/__init__.py new file mode 100644 index 00000000000..560a7220a1d --- /dev/null +++ b/src/zenml/log_stores/otel/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry log store implementation.""" + +from zenml.log_stores.otel.otel_flavor import OtelLogStoreFlavor +from zenml.log_stores.otel.otel_log_store import ( + OtelLogStore, + OtelLogStoreConfig, +) + +__all__ = [ + "OtelLogStore", + "OtelLogStoreConfig", + "OtelLogStoreFlavor", +] + diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py new file mode 100644 index 00000000000..ecb4a972ead --- /dev/null +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -0,0 +1,91 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry log store flavor.""" + +from typing import Type + +from zenml.enums import StackComponentType +from zenml.log_stores import BaseLogStore, BaseLogStoreConfig +from zenml.stack.flavor import Flavor + + +class OtelLogStoreFlavor(Flavor): + """OpenTelemetry log store flavor.""" + + @property + def name(self) -> str: + """Name of the flavor. + + Returns: + The name of the flavor. + """ + return "otel" + + @property + def docs_url(self) -> str: + """URL to the flavor documentation. + + Returns: + The URL to the flavor documentation. + """ + return "https://docs.zenml.io/stack-components/log-stores/otel" + + @property + def sdk_docs_url(self) -> str: + """URL to the SDK docs for this flavor. + + Returns: + The URL to the SDK docs for this flavor. 
+ """ + return self.docs_url + + @property + def logo_url(self) -> str: + """URL to the flavor logo. + + Returns: + The URL to the flavor logo. + """ + return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/otel.png" + + @property + def type(self) -> StackComponentType: + """Stack component type. + + Returns: + The stack component type. + """ + return StackComponentType.LOG_STORE + + @property + def config_class(self) -> Type[BaseLogStoreConfig]: + """Returns `OtelLogStoreConfig` config class. + + Returns: + The config class. + """ + from zenml.log_stores.otel.otel_log_store import OtelLogStoreConfig + + return OtelLogStoreConfig + + @property + def implementation_class(self) -> Type[BaseLogStore]: + """Implementation class for this flavor. + + Returns: + The implementation class. + """ + from zenml.log_stores.otel.otel_log_store import OtelLogStore + + return OtelLogStore diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py new file mode 100644 index 00000000000..ec9ac0a9955 --- /dev/null +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -0,0 +1,283 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry log store implementation.""" + +import logging +from abc import abstractmethod +from datetime import datetime +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from uuid import UUID + +from pydantic import Field + +from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig +from zenml.logger import get_logger, get_storage_log_level, logging_handlers +from zenml.utils.string_utils import random_str + +if TYPE_CHECKING: + from opentelemetry.sdk._logs import LoggerProvider + from opentelemetry.sdk._logs.export import LogExporter + + from zenml.logging.step_logging import LogEntry + from zenml.models import LogsResponse + +logger = get_logger(__name__) + + +class OtelLogStoreConfig(BaseLogStoreConfig): + """Configuration for OpenTelemetry log store. + + Attributes: + service_name: Name of the service (defaults to "zenml"). + service_version: Version of the service. + deployment_environment: Deployment environment (e.g., "production"). + max_queue_size: Maximum queue size for batch processor. + schedule_delay_millis: Delay between batch exports in milliseconds. + max_export_batch_size: Maximum batch size for exports. + endpoint: Optional OTLP endpoint URL (for HTTP/gRPC exporters). + headers: Optional headers for OTLP exporter. + insecure: Whether to use insecure connection for OTLP. 
+ """ + + service_name: str = Field( + default="zenml", + description="Name of the service for telemetry", + ) + service_version: str = Field( + default="1.0.0", + description="Version of the service", + ) + deployment_environment: str = Field( + default="production", + description="Deployment environment", + ) + max_queue_size: int = Field( + default=2048, + description="Maximum queue size for batch log processor", + ) + schedule_delay_millis: int = Field( + default=1000, + description="Export interval in milliseconds", + ) + max_export_batch_size: int = Field( + default=512, + description="Maximum batch size for exports", + ) + endpoint: Optional[str] = Field( + default=None, + description="OTLP endpoint URL", + ) + headers: Dict[str, str] = Field( + default_factory=dict, + description="Headers for OTLP exporter", + ) + insecure: bool = Field( + default=False, + description="Whether to use insecure connection", + ) + + +class OtelLogStore(BaseLogStore): + """Log store that exports logs using OpenTelemetry. + + This implementation uses the OpenTelemetry SDK to collect and export logs + to various backends. It uses a BatchLogRecordProcessor for efficient + background processing. + + Subclasses should implement `get_exporter()` to provide the specific + log exporter for their backend (e.g., console, OTLP, Datadog). + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize the OpenTelemetry log store. + + Args: + *args: Positional arguments for the base class. + **kwargs: Keyword arguments for the base class. + """ + super().__init__(*args, **kwargs) + self.logger_provider: Optional["LoggerProvider"] = None + self.handler: Optional[logging.Handler] = None + self._original_root_level: Optional[int] = None + self._pipeline_run_id: Optional[UUID] = None + self._step_id: Optional[UUID] = None + self._source: Optional[str] = None + + @property + def config(self) -> OtelLogStoreConfig: + """Returns the configuration of the OTel log store. + + Returns: + The configuration. + """ + return cast(OtelLogStoreConfig, self._config) + + @abstractmethod + def get_exporter(self) -> "LogExporter": + """Get the log exporter for this log store. + + Subclasses must implement this method to provide the appropriate + exporter for their backend (e.g., ConsoleLogExporter, OTLPLogExporter). + + Returns: + The log exporter instance. + """ + + def activate( + self, + pipeline_run_id: UUID, + step_id: Optional[UUID] = None, + source: str = "step", + ) -> None: + """Activate log collection with OpenTelemetry. + + Args: + pipeline_run_id: The ID of the pipeline run. + step_id: The ID of the step (if collecting step logs). + source: The source of the logs (e.g., "step", "orchestrator"). + """ + try: + from opentelemetry.sdk._logs import LoggerProvider + from opentelemetry.sdk._logs.export import BatchLogRecordProcessor + from opentelemetry.sdk.resources import Resource + except ImportError: + logger.error( + "OpenTelemetry SDK not installed. 
Install with: " + "pip install opentelemetry-sdk opentelemetry-exporter-otlp" + ) + return + + # Store metadata + self._pipeline_run_id = pipeline_run_id + self._step_id = step_id + self._source = source + + # Create resource with service information and ZenML metadata + resource_attributes = { + "service.name": self.config.service_name, + "service.version": self.config.service_version, + "service.instance.id": random_str(8), + "deployment.environment": self.config.deployment_environment, + "zenml.pipeline_run_id": str(pipeline_run_id), + "zenml.source": source, + } + if step_id: + resource_attributes["zenml.step_id"] = str(step_id) + + otel_resource = Resource.create(resource_attributes) + + # Create logger provider + self.logger_provider = LoggerProvider(resource=otel_resource) + + # Get exporter from subclass + exporter = self.get_exporter() + + # Create batch processor for efficient background processing + processor = BatchLogRecordProcessor( + exporter, + max_queue_size=self.config.max_queue_size, + schedule_delay_millis=self.config.schedule_delay_millis, + max_export_batch_size=self.config.max_export_batch_size, + ) + self.logger_provider.add_log_record_processor(processor) + + # Create handler for Python logging integration + try: + from opentelemetry.sdk._logs import LoggingHandler + + self.handler = LoggingHandler( + level=get_storage_log_level().value, + logger_provider=self.logger_provider, + ) + except ImportError: + logger.error( + "Failed to import LoggingHandler from OpenTelemetry SDK" + ) + return + + # Add handler to root logger + root_logger = logging.getLogger() + root_logger.addHandler(self.handler) + + # Set root logger level to minimum of all handlers + self._original_root_level = root_logger.level + handler_levels = [handler.level for handler in root_logger.handlers] + min_level = min(handler_levels) + if min_level < root_logger.level: + root_logger.setLevel(min_level) + + # Add to context variables for print capture + logging_handlers.add(self.handler) + + logger.debug( + f"OtelLogStore activated for {source} " + f"(pipeline_run={pipeline_run_id}, step={step_id})" + ) + + def deactivate(self) -> None: + """Deactivate log collection and flush remaining logs.""" + if not self.handler: + return + + # Remove handler from root logger + root_logger = logging.getLogger() + if self.handler in root_logger.handlers: + root_logger.removeHandler(self.handler) + + # Restore original root logger level + if self._original_root_level is not None: + root_logger.setLevel(self._original_root_level) + + # Remove from context variables + logging_handlers.remove(self.handler) + + # Flush and shutdown logger provider + if self.logger_provider: + try: + self.logger_provider.force_flush() + self.logger_provider.shutdown() + except Exception as e: + logger.warning( + f"Error shutting down OTel logger provider: {e}" + ) + + logger.debug("OtelLogStore deactivated") + + def fetch( + self, + logs_model: "LogsResponse", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List["LogEntry"]: + """Fetch logs from the OpenTelemetry backend. + + This method should be overridden by subclasses to implement + backend-specific log retrieval. The base implementation returns + an empty list. + + Args: + logs_model: The logs model containing run and step metadata. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of log entries to return. + + Returns: + List of log entries from the backend. 
+ """ + logger.warning( + "OtelLogStore.fetch() not implemented. " + "Subclasses should override this method to query their backend." + ) + return [] diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py new file mode 100644 index 00000000000..35c3ad0579e --- /dev/null +++ b/src/zenml/log_stores/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Utility functions for working with log stores.""" + +from datetime import datetime +from typing import TYPE_CHECKING, List, Optional + +if TYPE_CHECKING: + from zenml.logging.step_logging import LogEntry + from zenml.models import LogsResponse + from zenml.zen_stores.base_zen_store import BaseZenStore + + +def fetch_logs( + logs: "LogsResponse", + zen_store: "BaseZenStore", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, +) -> List["LogEntry"]: + """Fetch logs using the appropriate log store. + + This function determines which log store to use based on the log_store_id + in the logs record. If log_store_id is present, it loads that log store. + Otherwise, it falls back to DefaultLogStore. + + Args: + logs: The logs model containing metadata and log_store_id. + zen_store: The zen store to fetch log store component from. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of log entries to return. + + Returns: + List of log entries. 
+ """ + from zenml.enums import StackComponentType + from zenml.stack import StackComponent + + if logs.log_store_id: + log_store_model = zen_store.get_stack_component(logs.log_store_id) + log_store = StackComponent.from_model(log_store_model) + else: + from zenml.log_stores.default_log_store import ( + DefaultLogStore, + DefaultLogStoreConfig, + ) + from zenml.utils.time_utils import utc_now + + if not logs.artifact_store_id: + return [] + + artifact_store_model = zen_store.get_stack_component( + logs.artifact_store_id + ) + + log_store = DefaultLogStore( + name="default_log_store_fallback", + id=artifact_store_model.id, + config=DefaultLogStoreConfig(), + flavor="default", + type=StackComponentType.LOG_STORE, + user=artifact_store_model.user, + workspace=artifact_store_model.workspace, + created=utc_now(), + updated=utc_now(), + ) + + return log_store.fetch( + logs_model=logs, + start_time=start_time, + end_time=end_time, + limit=limit, + ) diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 31b835bda65..0382b578d2d 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -739,11 +739,9 @@ def __init__(self, source: str = "step") -> None: self.log_request_id = uuid4() - try: + if Client().active_stack.log_store: self.log_store = Client().active_stack.log_store - except AttributeError: - from zenml.log_stores.default_log_store import DefaultLogStore - + else: self.log_store = DefaultLogStore() def create_log_request(self) -> "LogsRequest": diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 1ec740df8f1..6eaa957456b 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -272,9 +272,7 @@ def launch(self) -> None: from zenml.enums import LoggableEntityType logs_context = step_logging.LoggingContext( - entity_type=LoggableEntityType.PIPELINE_RUN, - entity_id=pipeline_run.id, - source="execution", + source="step", ) # type: ignore[assignment] logs_model = logs_context.create_log_request() diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index 058de4a09e7..ef0790b9b03 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -38,7 +38,7 @@ handle_bool_env_var, ) from zenml.deployers.server import runtime -from zenml.enums import ArtifactSaveType, LoggableEntityType +from zenml.enums import ArtifactSaveType from zenml.exceptions import StepInterfaceError from zenml.hooks.hook_validators import load_and_run_hook from zenml.logger import get_logger @@ -151,9 +151,7 @@ def run( logs_context = nullcontext() if step_logging_enabled and not redirected.get(): if step_run.logs: - logs_context = LoggingContext( # type: ignore[assignment] - entity_type=LoggableEntityType.STEP_RUN, - entity_id=step_run.id, + logs_context = LoggingContext( source="step", ) else: diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index b9ff9e1736f..4f4984f9f4d 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -962,7 +962,7 @@ def _run( if logging_enabled: logs_context = LoggingContext() - logs_request = logs_context.generate_logs_request() + logs_request = logs_context.create_log_request() with logs_context: snapshot = self._create_snapshot(**self._run_args) From 96bf1f854e172dd8fd8f41a48468f2d9bc4dfe9c Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 
2025 11:00:37 +0100 Subject: [PATCH 06/81] new checkpoint --- src/zenml/log_stores/__init__.py | 38 +- src/zenml/log_stores/base_log_store.py | 47 +- src/zenml/log_stores/datadog/__init__.py | 15 +- src/zenml/log_stores/default/__init__.py | 14 + .../log_stores/default/default_log_store.py | 831 ++++++++++++++++++ .../default/default_log_store_flavor.py | 101 +++ src/zenml/log_stores/default_log_store.py | 216 ----- src/zenml/log_stores/otel/__init__.py | 15 +- src/zenml/log_stores/utils.py | 2 +- src/zenml/logging/step_logging.py | 675 +------------- src/zenml/orchestrators/step_launcher.py | 12 +- src/zenml/orchestrators/step_runner.py | 4 +- src/zenml/pipelines/pipeline_definition.py | 4 +- src/zenml/stack/flavor_registry.py | 7 +- 14 files changed, 1068 insertions(+), 913 deletions(-) create mode 100644 src/zenml/log_stores/default/__init__.py create mode 100644 src/zenml/log_stores/default/default_log_store.py create mode 100644 src/zenml/log_stores/default/default_log_store_flavor.py delete mode 100644 src/zenml/log_stores/default_log_store.py diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index a59f87e105d..218d3b7c19d 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -16,16 +16,48 @@ ZenML log stores provide different backends for storing pipeline and step logs. """ -from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig -from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor +# Base log store +from zenml.log_stores.base_log_store import ( + BaseLogStore, + BaseLogStoreConfig, + BaseLogStoreFlavor, +) + +# Default log store +from zenml.log_stores.default.default_log_store import ( + DefaultLogStore, + DefaultLogStoreConfig, +) +from zenml.log_stores.default.default_log_store_flavor import ( + DefaultLogStoreFlavor, +) + +# OpenTelemetry log store +from zenml.log_stores.otel.otel_log_store import OtelLogStore, OtelLogStoreConfig from zenml.log_stores.otel.otel_flavor import OtelLogStoreFlavor + +# Datadog log store +from zenml.log_stores.datadog.datadog_log_store import ( + DatadogLogStore, + DatadogLogStoreConfig, +) +from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor + +# Utils from zenml.log_stores.utils import fetch_logs __all__ = [ "BaseLogStore", "BaseLogStoreConfig", + "BaseLogStoreFlavor", + "DatadogLogStore", + "DatadogLogStoreConfig", "DatadogLogStoreFlavor", + "DefaultLogStore", + "DefaultLogStoreConfig", + "DefaultLogStoreFlavor", + "OtelLogStore", + "OtelLogStoreConfig", "OtelLogStoreFlavor", "fetch_logs", ] - diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 95ce11ed231..c906b118cf7 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -15,13 +15,14 @@ from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, List, Optional, cast +from typing import TYPE_CHECKING, List, Optional, Type, cast -from zenml.stack import StackComponent, StackComponentConfig +from zenml.enums import StackComponentType +from zenml.stack import Flavor, StackComponent, StackComponentConfig if TYPE_CHECKING: from zenml.logging.step_logging import LogEntry - from zenml.models import LogsResponse + from zenml.models import LogsRequest, LogsResponse class BaseLogStoreConfig(StackComponentConfig): @@ -46,10 +47,7 @@ def config(self) -> BaseLogStoreConfig: return cast(BaseLogStoreConfig, self._config) @abstractmethod - def activate( - 
self, - source: str = "step", - ) -> None: + def activate(self, log_request: "LogsRequest") -> None: """Activate the log store for log collection. This method is called when ZenML needs to start collecting and storing @@ -57,9 +55,7 @@ def activate( handlers, threads, or connections. Args: - pipeline_run_id: The ID of the pipeline run. - step_id: The ID of the step (if collecting step logs). - source: The source of the logs (e.g., "step", "orchestrator"). + log_request: The log request model. """ @abstractmethod @@ -100,3 +96,34 @@ def fetch( Returns: List of log entries matching the query. """ + + +class BaseLogStoreFlavor(Flavor): + """Base class for all ZenML log store flavors.""" + + @property + def type(self) -> StackComponentType: + """Type of the flavor. + + Returns: + The type of the flavor. + """ + return StackComponentType.LOG_STORE + + @property + def config_class(self) -> Type[BaseLogStoreConfig]: + """Config class for the base log store flavor. + + Returns: + The config class. + """ + return BaseLogStoreConfig + + @property + @abstractmethod + def implementation_class(self) -> Type["BaseLogStore"]: + """Implementation class for the base log store flavor. + + Returns: + The implementation class. + """ diff --git a/src/zenml/log_stores/datadog/__init__.py b/src/zenml/log_stores/datadog/__init__.py index 01a29e3e61e..7a764c79391 100644 --- a/src/zenml/log_stores/datadog/__init__.py +++ b/src/zenml/log_stores/datadog/__init__.py @@ -11,17 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Datadog log store implementation.""" - -from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor -from zenml.log_stores.datadog.datadog_log_store import ( - DatadogLogStore, - DatadogLogStoreConfig, -) - -__all__ = [ - "DatadogLogStore", - "DatadogLogStoreConfig", - "DatadogLogStoreFlavor", -] - +"""Datadog log store implementation.""" \ No newline at end of file diff --git a/src/zenml/log_stores/default/__init__.py b/src/zenml/log_stores/default/__init__.py new file mode 100644 index 00000000000..2d8058ed404 --- /dev/null +++ b/src/zenml/log_stores/default/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Default log store implementation.""" \ No newline at end of file diff --git a/src/zenml/log_stores/default/default_log_store.py b/src/zenml/log_stores/default/default_log_store.py new file mode 100644 index 00000000000..5af6b3757ef --- /dev/null +++ b/src/zenml/log_stores/default/default_log_store.py @@ -0,0 +1,831 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Default log store implementation.""" + +import asyncio +import logging +import os +import queue +import re +import threading +import time +from datetime import datetime +from typing import ( + TYPE_CHECKING, + Any, + Iterator, + List, + Optional, + Union, + cast, +) +from uuid import UUID, uuid4 + +from zenml.artifact_stores import BaseArtifactStore +from zenml.artifacts.utils import _load_artifact_store +from zenml.client import Client +from zenml.constants import ( + LOGS_MERGE_INTERVAL_SECONDS, + LOGS_STORAGE_MAX_QUEUE_SIZE, + LOGS_STORAGE_QUEUE_TIMEOUT, + LOGS_WRITE_INTERVAL_SECONDS, +) +from zenml.enums import LoggingLevels +from zenml.exceptions import DoesNotExistException +from zenml.log_stores.base_log_store import BaseLogStore +from zenml.log_stores.default.default_log_store_flavor import ( + DefaultLogStoreConfig, +) +from zenml.logger import ( + get_logger, + get_storage_log_level, + logging_handlers, +) +from zenml.logging.step_logging import ( + DEFAULT_MESSAGE_SIZE, + MAX_ENTRIES_PER_REQUEST, + LogEntry, +) +from zenml.models import ( + LogsRequest, + LogsResponse, +) +from zenml.utils.io_utils import sanitize_remote_path +from zenml.utils.time_utils import utc_now +from zenml.zen_stores.base_zen_store import BaseZenStore + +if TYPE_CHECKING: + from zenml.artifact_stores import BaseArtifactStore + from zenml.logging.step_logging import ( + ArtifactStoreHandler, + LogEntry, + PipelineLogsStorage, + ) + +ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") + +logger = get_logger(__name__) + + +LOGS_EXTENSION = ".log" + + +def remove_ansi_escape_codes(text: str) -> str: + """Auxiliary function to remove ANSI escape codes from a given string. + + Args: + text: the input string + + Returns: + the version of the input string where the escape codes are removed. + """ + return ansi_escape.sub("", text) + + +def prepare_logs_uri( + artifact_store: "BaseArtifactStore", + log_id: UUID, +) -> str: + """Generates and prepares a URI for the log file or folder for a step. + + Args: + artifact_store: The artifact store on which the artifact will be stored. + log_id: The ID of the logs entity + + Returns: + The URI of the log storage (file or folder). + """ + logs_base_uri = os.path.join(artifact_store.path, "logs") + + if not artifact_store.exists(logs_base_uri): + artifact_store.makedirs(logs_base_uri) + + if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + logs_uri = os.path.join(logs_base_uri, log_id) + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs directory {logs_uri} already exists! Removing old log directory..." + ) + artifact_store.rmtree(logs_uri) + + artifact_store.makedirs(logs_uri) + else: + logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs file {logs_uri} already exists! Removing old log file..." + ) + artifact_store.remove(logs_uri) + + return sanitize_remote_path(logs_uri) + + +def fetch_log_records( + zen_store: "BaseZenStore", + artifact_store_id: Union[str, UUID], + logs_uri: str, +) -> List[LogEntry]: + """Fetches log entries. 
+ + Args: + zen_store: The store in which the artifact is stored. + artifact_store_id: The ID of the artifact store. + logs_uri: The URI of the artifact (file or directory). + + Returns: + List of log entries. + """ + log_entries = [] + + for line in _stream_logs_line_by_line( + zen_store, artifact_store_id, logs_uri + ): + if log_entry := parse_log_entry(line): + log_entries.append(log_entry) + + if len(log_entries) >= MAX_ENTRIES_PER_REQUEST: + break + + return log_entries + + +def _stream_logs_line_by_line( + zen_store: "BaseZenStore", + artifact_store_id: Union[str, UUID], + logs_uri: str, +) -> Iterator[str]: + """Stream logs line by line without loading the entire file into memory. + + This generator yields log lines one by one, handling both single files + and directories with multiple log files. + + Args: + zen_store: The store in which the artifact is stored. + artifact_store_id: The ID of the artifact store. + logs_uri: The URI of the log file or directory. + + Yields: + Individual log lines as strings. + + Raises: + DoesNotExistException: If the artifact does not exist in the artifact store. + """ + artifact_store = _load_artifact_store(artifact_store_id, zen_store) + + try: + if not artifact_store.isdir(logs_uri): + # Single file case + with artifact_store.open(logs_uri, "r") as file: + for line in file: + yield line.rstrip("\n\r") + else: + # Directory case - may contain multiple log files + files = artifact_store.listdir(logs_uri) + if not files: + raise DoesNotExistException( + f"Folder '{logs_uri}' is empty in artifact store " + f"'{artifact_store.name}'." + ) + + # Sort files to read them in order + files.sort() + + for file in files: + file_path = os.path.join(logs_uri, str(file)) + with artifact_store.open(file_path, "r") as f: + for line in f: + yield line.rstrip("\n\r") + finally: + artifact_store.cleanup() + + +def parse_log_entry(log_line: str) -> Optional[LogEntry]: + """Parse a single log entry into a LogEntry object. + + Handles two formats: + 1. JSON format: {"timestamp": "...", "level": "...", "message": "...", "location": "..."} + Uses Pydantic's model_validate_json for automatic parsing and validation. + 2. Plain text: Any other text (defaults to INFO level) + + Args: + log_line: A single log line to parse + + Returns: + LogEntry object. For JSON logs, all fields are validated and parsed automatically. + For plain text logs, only message is populated with INFO level default. + Returns None only for empty lines. + """ + line = log_line.strip() + if not line: + return None + + if line.startswith("{") and line.endswith("}"): + try: + return LogEntry.model_validate_json(line) + except Exception: + pass + + old_format = re.search( + r"^\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+UTC\]", line + ) + + timestamp = None + if old_format: + timestamp = old_format.group(1) + "Z" + line = line.replace(old_format.group(0), "").strip() + + return LogEntry( + message=line, + name=None, + level=LoggingLevels.INFO, + timestamp=timestamp, + ) + + +class PipelineLogsStorage: + """Helper class which buffers and stores logs to a given URI using a background thread.""" + + def __init__( + self, + logs_uri: str, + artifact_store: "BaseArtifactStore", + max_queue_size: int = LOGS_STORAGE_MAX_QUEUE_SIZE, + queue_timeout: int = LOGS_STORAGE_QUEUE_TIMEOUT, + write_interval: int = LOGS_WRITE_INTERVAL_SECONDS, + merge_files_interval: int = LOGS_MERGE_INTERVAL_SECONDS, + ) -> None: + """Initialization. + + Args: + logs_uri: the URI of the log file or folder. 
+ artifact_store: Artifact Store from the current step context + max_queue_size: maximum number of individual messages to queue. + queue_timeout: timeout in seconds for putting items in queue when full. + - Positive value: Wait N seconds, then drop logs if queue still full + - Negative value: Block indefinitely until queue has space (never drop logs) + write_interval: the amount of seconds before the created files + get written to the artifact store. + merge_files_interval: the amount of seconds before the created files + get merged into a single file. + """ + # Parameters + self.logs_uri = logs_uri + self.max_queue_size = max_queue_size + self.queue_timeout = queue_timeout + self.write_interval = write_interval + self.merge_files_interval = merge_files_interval + + # State + self.artifact_store = artifact_store + + # Immutable filesystems state + self.last_merge_time = time.time() + + # Queue and log storage thread for async processing + self.log_queue: queue.Queue[str] = queue.Queue(maxsize=max_queue_size) + self.log_storage_thread: Optional[threading.Thread] = None + self.shutdown_event = threading.Event() + self.merge_event = threading.Event() + + # Start the log storage thread + self._start_log_storage_thread() + + def _start_log_storage_thread(self) -> None: + """Start the log storage thread for processing log queue.""" + if ( + self.log_storage_thread is None + or not self.log_storage_thread.is_alive() + ): + self.log_storage_thread = threading.Thread( + target=self._log_storage_worker, + name="LogsStorage-Worker", + ) + self.log_storage_thread.start() + + def _process_log_queue(self, force_merge: bool = False) -> None: + """Write and merge logs to the artifact store using time-based batching. + + Args: + force_merge: Whether to force merge the logs. + """ + try: + messages = [] + + # Get first message (blocking with timeout) + try: + first_message = self.log_queue.get(timeout=1) + messages.append(first_message) + except queue.Empty: + return + + # Get any remaining messages without waiting (drain quickly) + while True: + try: + additional_message = self.log_queue.get_nowait() + messages.append(additional_message) + except queue.Empty: + break + + # Write the messages to the artifact store + if messages: + self.write_buffer(messages) + + # Merge the log files if needed + if ( + self._is_merge_needed + or self.merge_event.is_set() + or force_merge + ): + self.merge_event.clear() + + self.merge_log_files(merge_all_files=force_merge) + + except Exception as e: + logger.error("Error in log storage thread: %s", e) + finally: + for _ in messages: + self.log_queue.task_done() + + # Wait for the next write interval or until shutdown is requested + self.shutdown_event.wait(timeout=self.write_interval) + + def _log_storage_worker(self) -> None: + """Log storage thread worker that processes the log queue.""" + # Process the log queue until shutdown is requested + while not self.shutdown_event.is_set(): + self._process_log_queue() + + # Shutdown requested - drain remaining queue items and merge log files + self._process_log_queue(force_merge=True) + + def _shutdown_log_storage_thread(self, timeout: int = 5) -> None: + """Shutdown the log storage thread gracefully. + + Args: + timeout: Maximum time to wait for thread shutdown. 
+ """ + if self.log_storage_thread and self.log_storage_thread.is_alive(): + # Then signal the worker to begin graceful shutdown + self.shutdown_event.set() + + # Wait for thread to finish (it will drain the queue automatically) + self.log_storage_thread.join(timeout=timeout) + + def write(self, text: str) -> None: + """Main write method that sends individual messages directly to queue. + + Args: + text: the incoming string. + """ + # Skip empty lines + if text == "\n": + return + + # If the current thread is the log storage thread, do nothing + # to prevent recursion when the storage thread itself generates logs + if ( + self.log_storage_thread + and threading.current_thread() == self.log_storage_thread + ): + return + + # If the current thread is the fsspec IO thread, do nothing + if self._is_fsspec_io_thread: + return + + try: + # Send individual message directly to queue + if not self.shutdown_event.is_set(): + try: + if self.queue_timeout < 0: + # Negative timeout = block indefinitely until queue has space + # Guarantees no log loss but may hang application + self.log_queue.put(text) + else: + # Positive timeout = wait specified time then drop logs + # Prevents application hanging but may lose logs + self.log_queue.put(text, timeout=self.queue_timeout) + except queue.Full: + # This only happens with positive timeout + # Queue is full - just skip this message to avoid blocking + # Better to drop logs than hang the application + pass + + except Exception: + # Silently ignore errors to prevent recursion + pass + + @property + def _is_merge_needed(self) -> bool: + """Checks whether the log files need to be merged. + + Returns: + whether the log files need to be merged. + """ + return ( + self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM + and time.time() - self.last_merge_time > self.merge_files_interval + ) + + @property + def _is_fsspec_io_thread(self) -> bool: + """Checks if the current thread is the fsspec IO thread. + + Returns: + whether the current thread is the fsspec IO thread. + """ + # Most artifact stores are based on fsspec, which converts between + # sync and async operations by using a separate AIO thread. + # It may happen that the fsspec call itself will log something, + # which will trigger this method, which may then use fsspec again, + # causing a "Calling sync() from within a running loop" error, because + # the fsspec library does not expect sync calls being made as a result + # of a logging call made by itself. + # To avoid this, we simply check if we're running in the fsspec AIO + # thread and skip the save if that's the case. + try: + return ( + asyncio.events.get_running_loop() is not None + and threading.current_thread().name == "fsspecIO" + ) + except RuntimeError: + # No running loop + return False + + def _get_timestamped_filename(self, suffix: str = "") -> str: + """Returns a timestamped filename. + + Args: + suffix: optional suffix for the file name + + Returns: + The timestamped filename. + """ + return f"{time.time()}{suffix}{LOGS_EXTENSION}" + + def write_buffer(self, buffer_to_write: List[str]) -> None: + """Write the given buffer to file. This runs in the log storage thread. + + Args: + buffer_to_write: The buffer contents to write to file. 
+ """ + if not buffer_to_write: + return + + try: + # If the artifact store is immutable, write the buffer to a new file + if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + _logs_uri = self._get_timestamped_filename() + with self.artifact_store.open( + os.path.join( + self.logs_uri, + _logs_uri, + ), + "w", + ) as file: + for message in buffer_to_write: + file.write(f"{message}\n") + + # If the artifact store is mutable, append the buffer to the existing file + else: + with self.artifact_store.open(self.logs_uri, "a") as file: + for message in buffer_to_write: + file.write(f"{message}\n") + self.artifact_store._remove_previous_file_versions( + self.logs_uri + ) + + except Exception as e: + logger.error("Error in log storage thread: %s", e) + + def merge_log_files(self, merge_all_files: bool = False) -> None: + """Merges all log files into one in the given URI. + + Called on the logging context exit. + + Args: + merge_all_files: whether to merge all files or only raw files + """ + from zenml.artifacts.utils import ( + _load_file_from_artifact_store, + ) + + # If the artifact store is immutable, merge the log files + if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + merged_file_suffix = "_merged" + files_ = self.artifact_store.listdir(self.logs_uri) + if not merge_all_files: + # already merged files will not be merged again + files_ = [ + f for f in files_ if merged_file_suffix not in str(f) + ] + file_name_ = self._get_timestamped_filename( + suffix=merged_file_suffix + ) + if len(files_) > 1: + files_.sort() + logger.debug("Log files count: %s", len(files_)) + + missing_files = set() + # dump all logs to a local file first + with self.artifact_store.open( + os.path.join(self.logs_uri, file_name_), "w" + ) as merged_file: + for file in files_: + try: + merged_file.write( + str( + _load_file_from_artifact_store( + os.path.join(self.logs_uri, str(file)), + artifact_store=self.artifact_store, + mode="r", + ) + ) + ) + except DoesNotExistException: + missing_files.add(file) + + # clean up left over files + for file in files_: + if file not in missing_files: + self.artifact_store.remove( + os.path.join(self.logs_uri, str(file)) + ) + + # Update the last merge time + self.last_merge_time = time.time() + + def send_merge_event(self) -> None: + """Send a merge event to the log storage thread.""" + self.merge_event.set() + + +class ArtifactStoreHandler(logging.Handler): + """Handler that writes log messages to artifact store storage.""" + + def __init__(self, storage: "PipelineLogsStorage"): + """Initialize the handler with a storage instance. + + Args: + storage: The PipelineLogsStorage instance to write to. + """ + super().__init__() + self.storage = storage + + # Get storage log level from environment + self.setLevel(get_storage_log_level().value) + + def emit(self, record: logging.LogRecord) -> None: + """Emit a log record to the storage. + + Args: + record: The log record to emit. 
+ """ + try: + # Get level enum + level = LoggingLevels.__members__.get(record.levelname.upper()) + + # Get the message + message = self.format(record) + message = remove_ansi_escape_codes(message).rstrip() + + # Check if message needs to be chunked + message_bytes = message.encode("utf-8") + if len(message_bytes) <= DEFAULT_MESSAGE_SIZE: + # Message is small enough, emit as-is + log_record = LogEntry.model_construct( + message=message, + name=record.name, + level=level, + timestamp=utc_now(tz_aware=True), + module=record.module, + filename=record.filename, + lineno=record.lineno, + ) + json_line = log_record.model_dump_json(exclude_none=True) + self.storage.write(json_line) + else: + # Message is too large, split into chunks and emit each one + chunks = self._split_to_chunks(message) + entry_id = uuid4() + for i, chunk in enumerate(chunks): + log_record = LogEntry.model_construct( + message=chunk, + name=record.name, + level=level, + module=record.module, + filename=record.filename, + lineno=record.lineno, + timestamp=utc_now(tz_aware=True), + chunk_index=i, + total_chunks=len(chunks), + id=entry_id, + ) + + json_line = log_record.model_dump_json(exclude_none=True) + self.storage.write(json_line) + except Exception: + pass + + def _split_to_chunks(self, message: str) -> List[str]: + """Split a large message into chunks. + + Args: + message: The message to split. + + Returns: + A list of message chunks. + """ + # Calculate how many chunks we need + message_bytes = message.encode("utf-8") + + # Split the message into chunks, handling UTF-8 boundaries + chunks = [] + start = 0 + + while start < len(message_bytes): + # Calculate the end position for this chunk + end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) + + # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue + while end > start: + chunk_bytes = message_bytes[start:end] + try: + chunk_text = chunk_bytes.decode("utf-8") + chunks.append(chunk_text) + break + except UnicodeDecodeError: + # If we can't decode, try a smaller chunk + end -= 1 + else: + # If we can't decode anything, use replacement characters + end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) + chunks.append( + message_bytes[start:end].decode("utf-8", errors="replace") + ) + + start = end + + return chunks + + +class DefaultLogStore(BaseLogStore): + """Log store that saves logs to the artifact store. + + This implementation uses the artifact store as the backend for log storage, + maintaining backward compatibility with existing ZenML behavior. Logs are + written to the artifact store using a background thread and queue for + efficient batching. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize the default log store. + + Args: + *args: Positional arguments for the base class. + **kwargs: Keyword arguments for the base class. + """ + client = Client() + self._artifact_store = client.active_stack.artifact_store + + self.storage: Optional["PipelineLogsStorage"] = None + self.handler: Optional["ArtifactStoreHandler"] = None + + self.uri: Optional[str] = None + self.artifact_store_id: Optional[UUID] = None + + self._original_root_level: Optional[int] = None + + @property + def config(self) -> DefaultLogStoreConfig: + """Returns the configuration of the default log store. + + Returns: + The configuration. + """ + return cast(DefaultLogStoreConfig, self._config) + + def activate( + self, + log_request: "LogsRequest", + ) -> None: + """Activate log collection to the artifact store. 
+ + Args: + log_request: The log request model. + """ + logs_uri = prepare_logs_uri( + log_id=log_request.id, + artifact_store=self._artifact_store, + ) + + # Create storage and handler + self.storage = PipelineLogsStorage( + logs_uri=logs_uri, + artifact_store=self._artifact_store, + ) + self.handler = ArtifactStoreHandler(self.storage) + + # Add handler to root logger + root_logger = logging.getLogger() + root_logger.addHandler(self.handler) + + # Set root logger level to minimum of all handlers + self._original_root_level = root_logger.level + handler_levels = [handler.level for handler in root_logger.handlers] + min_level = min(handler_levels) + if min_level < root_logger.level: + root_logger.setLevel(min_level) + + # Add to context variables for print capture + logging_handlers.add(self.handler) + + def deactivate(self) -> None: + """Deactivate log collection and flush remaining logs.""" + if not self.handler: + return + + # Remove handler from root logger + root_logger = logging.getLogger() + if self.handler in root_logger.handlers: + root_logger.removeHandler(self.handler) + + # Restore original root logger level + if self._original_root_level is not None: + root_logger.setLevel(self._original_root_level) + + # Remove from context variables + logging_handlers.remove(self.handler) + + # Shutdown storage thread (flushes and merges logs) + if self.storage: + try: + self.storage._shutdown_log_storage_thread() + except Exception as e: + logger.warning(f"Error shutting down log storage: {e}") + + logger.debug("DefaultLogStore deactivated") + + def fetch( + self, + logs_model: "LogsResponse", + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None, + limit: int = 20000, + ) -> List["LogEntry"]: + """Fetch logs from the artifact store. + + Args: + logs_model: The logs model containing uri and artifact_store_id. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of log entries to return. + + Returns: + List of log entries from the artifact store. + + Raises: + ValueError: If logs_model.uri is not provided. + """ + from zenml.logging.step_logging import fetch_log_records + + if not logs_model.uri: + raise ValueError( + "logs_model.uri is required for DefaultLogStore.fetch()" + ) + + if not logs_model.artifact_store_id: + raise ValueError( + "logs_model.artifact_store_id is required for DefaultLogStore.fetch()" + ) + + client = Client() + log_entries = fetch_log_records( + zen_store=client.zen_store, + artifact_store_id=logs_model.artifact_store_id, + logs_uri=logs_model.uri, + ) + + if start_time or end_time: + filtered_entries = [] + for entry in log_entries: + if entry.timestamp: + if start_time and entry.timestamp < start_time: + continue + if end_time and entry.timestamp > end_time: + continue + filtered_entries.append(entry) + log_entries = filtered_entries + + return log_entries[:limit] diff --git a/src/zenml/log_stores/default/default_log_store_flavor.py b/src/zenml/log_stores/default/default_log_store_flavor.py new file mode 100644 index 00000000000..c9a8850df70 --- /dev/null +++ b/src/zenml/log_stores/default/default_log_store_flavor.py @@ -0,0 +1,101 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Default log store flavor implementation.""" + +from typing import TYPE_CHECKING, Type + +from zenml.enums import StackComponentType +from zenml.log_stores.base_log_store import BaseLogStoreConfig +from zenml.stack.flavor import Flavor + +if TYPE_CHECKING: + from zenml.log_stores.base_log_store import BaseLogStore + + +class DefaultLogStoreConfig(BaseLogStoreConfig): + """Configuration for the default log store. + + This log store saves logs to the artifact store, which is the default + and backward-compatible approach. + """ + + +class DefaultLogStoreFlavor(Flavor): + """Default log store flavor implementation.""" + + @property + def name(self) -> str: + """Name of the flavor. + + Returns: + The name of the flavor. + """ + return "default" + + @property + def docs_url(self) -> str: + """URL to the flavor documentation. + + Returns: + The URL to the flavor documentation. + """ + return "https://docs.zenml.io/stack-components/log-stores/default" + + @property + def sdk_docs_url(self) -> str: + """URL to the SDK docs for this flavor. + + Returns: + The URL to the SDK docs for this flavor. + """ + return self.docs_url + + @property + def logo_url(self) -> str: + """URL to the flavor logo. + + Returns: + The URL to the flavor logo. + """ + # TODO: Add a logo for the default log store + return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/default.png" + + @property + def type(self) -> StackComponentType: + """Stack component type. + + Returns: + The stack component type. + """ + return StackComponentType.LOG_STORE + + @property + def config_class(self) -> Type[BaseLogStoreConfig]: + """Returns `DefaultLogStoreConfig` config class. + + Returns: + The config class. + """ + return DefaultLogStoreConfig + + @property + def implementation_class(self) -> Type["BaseLogStore"]: + """Implementation class for this flavor. + + Returns: + The implementation class. + """ + from zenml.log_stores.default.default_log_store import DefaultLogStore + + return DefaultLogStore diff --git a/src/zenml/log_stores/default_log_store.py b/src/zenml/log_stores/default_log_store.py deleted file mode 100644 index 30e774bf1e0..00000000000 --- a/src/zenml/log_stores/default_log_store.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) ZenML GmbH 2025. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. 
-"""Default log store implementation.""" - -import logging -from datetime import datetime -from typing import TYPE_CHECKING, Any, List, Optional, cast -from uuid import UUID - -from zenml.client import Client -from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig -from zenml.logger import get_logger, logging_handlers - -if TYPE_CHECKING: - from zenml.artifact_stores import BaseArtifactStore - from zenml.logging.step_logging import ( - ArtifactStoreHandler, - LogEntry, - PipelineLogsStorage, - ) - from zenml.models import LogsResponse - -logger = get_logger(__name__) - - -class DefaultLogStoreConfig(BaseLogStoreConfig): - """Configuration for the default log store. - - This log store saves logs to the artifact store, which is the default - and backward-compatible approach. - """ - - -class DefaultLogStore(BaseLogStore): - """Log store that saves logs to the artifact store. - - This implementation uses the artifact store as the backend for log storage, - maintaining backward compatibility with existing ZenML behavior. Logs are - written to the artifact store using a background thread and queue for - efficient batching. - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - """Initialize the default log store. - - Args: - *args: Positional arguments for the base class. - **kwargs: Keyword arguments for the base class. - """ - super().__init__(*args, **kwargs) - self.storage: Optional["PipelineLogsStorage"] = None - self.handler: Optional["ArtifactStoreHandler"] = None - self._artifact_store: Optional["BaseArtifactStore"] = None - self._original_root_level: Optional[int] = None - - @property - def config(self) -> DefaultLogStoreConfig: - """Returns the configuration of the default log store. - - Returns: - The configuration. - """ - return cast(DefaultLogStoreConfig, self._config) - - def activate( - self, - pipeline_run_id: UUID, - step_id: Optional[UUID] = None, - source: str = "step", - ) -> None: - """Activate log collection to the artifact store. - - Args: - pipeline_run_id: The ID of the pipeline run. - step_id: The ID of the step (if collecting step logs). - source: The source of the logs (e.g., "step", "orchestrator"). 
- """ - from zenml.logging.step_logging import ( - ArtifactStoreHandler, - PipelineLogsStorage, - prepare_logs_uri, - ) - - # Get the artifact store from the active stack - client = Client() - self._artifact_store = client.active_stack.artifact_store - - # Prepare logs URI - step_name = None - if step_id: - try: - step_run = client.get_pipeline_run_step(step_id) - step_name = step_run.name - except Exception: - pass - - logs_uri = prepare_logs_uri( - artifact_store=self._artifact_store, - step_name=step_name, - ) - - # Create storage and handler - self.storage = PipelineLogsStorage( - logs_uri=logs_uri, - artifact_store=self._artifact_store, - ) - self.handler = ArtifactStoreHandler(self.storage) - - # Add handler to root logger - root_logger = logging.getLogger() - root_logger.addHandler(self.handler) - - # Set root logger level to minimum of all handlers - self._original_root_level = root_logger.level - handler_levels = [handler.level for handler in root_logger.handlers] - min_level = min(handler_levels) - if min_level < root_logger.level: - root_logger.setLevel(min_level) - - # Add to context variables for print capture - logging_handlers.add(self.handler) - - logger.debug( - f"DefaultLogStore activated for {source} " - f"(pipeline_run={pipeline_run_id}, step={step_id})" - ) - - def deactivate(self) -> None: - """Deactivate log collection and flush remaining logs.""" - if not self.handler: - return - - # Remove handler from root logger - root_logger = logging.getLogger() - if self.handler in root_logger.handlers: - root_logger.removeHandler(self.handler) - - # Restore original root logger level - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) - - # Remove from context variables - logging_handlers.remove(self.handler) - - # Shutdown storage thread (flushes and merges logs) - if self.storage: - try: - self.storage._shutdown_log_storage_thread() - except Exception as e: - logger.warning(f"Error shutting down log storage: {e}") - - logger.debug("DefaultLogStore deactivated") - - def fetch( - self, - logs_model: "LogsResponse", - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - limit: int = 20000, - ) -> List["LogEntry"]: - """Fetch logs from the artifact store. - - Args: - logs_model: The logs model containing uri and artifact_store_id. - start_time: Filter logs after this time. - end_time: Filter logs before this time. - limit: Maximum number of log entries to return. - - Returns: - List of log entries from the artifact store. - - Raises: - ValueError: If logs_model.uri is not provided. 
- """ - from zenml.logging.step_logging import fetch_log_records - - if not logs_model.uri: - raise ValueError( - "logs_model.uri is required for DefaultLogStore.fetch()" - ) - - if not logs_model.artifact_store_id: - raise ValueError( - "logs_model.artifact_store_id is required for DefaultLogStore.fetch()" - ) - - client = Client() - log_entries = fetch_log_records( - zen_store=client.zen_store, - artifact_store_id=logs_model.artifact_store_id, - logs_uri=logs_model.uri, - ) - - if start_time or end_time: - filtered_entries = [] - for entry in log_entries: - if entry.timestamp: - if start_time and entry.timestamp < start_time: - continue - if end_time and entry.timestamp > end_time: - continue - filtered_entries.append(entry) - log_entries = filtered_entries - - return log_entries[:limit] diff --git a/src/zenml/log_stores/otel/__init__.py b/src/zenml/log_stores/otel/__init__.py index 560a7220a1d..209bede4849 100644 --- a/src/zenml/log_stores/otel/__init__.py +++ b/src/zenml/log_stores/otel/__init__.py @@ -11,17 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""OpenTelemetry log store implementation.""" - -from zenml.log_stores.otel.otel_flavor import OtelLogStoreFlavor -from zenml.log_stores.otel.otel_log_store import ( - OtelLogStore, - OtelLogStoreConfig, -) - -__all__ = [ - "OtelLogStore", - "OtelLogStoreConfig", - "OtelLogStoreFlavor", -] - +"""OpenTelemetry log store implementation.""" \ No newline at end of file diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index 35c3ad0579e..15535a70b5c 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -52,7 +52,7 @@ def fetch_logs( log_store_model = zen_store.get_stack_component(logs.log_store_id) log_store = StackComponent.from_model(log_store_model) else: - from zenml.log_stores.default_log_store import ( + from zenml.log_stores.default.default_log_store import ( DefaultLogStore, DefaultLogStoreConfig, ) diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 0382b578d2d..30046a2cdc2 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -13,55 +13,35 @@ # permissions and limitations under the License. 
"""ZenML logging handler.""" -import asyncio -import logging -import os -import queue import re -import threading -import time from contextlib import nullcontext from contextvars import ContextVar from datetime import datetime from types import TracebackType from typing import ( Any, - Iterator, - List, Optional, Type, - Union, ) from uuid import UUID, uuid4 from pydantic import BaseModel, Field -from zenml.artifact_stores import BaseArtifactStore -from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, - LOGS_MERGE_INTERVAL_SECONDS, - LOGS_STORAGE_MAX_QUEUE_SIZE, - LOGS_STORAGE_QUEUE_TIMEOUT, - LOGS_WRITE_INTERVAL_SECONDS, handle_bool_env_var, ) from zenml.enums import LoggingLevels -from zenml.exceptions import DoesNotExistException -from zenml.log_stores.default_log_store import DefaultLogStore from zenml.logger import ( get_logger, - get_storage_log_level, ) from zenml.models import ( LogsRequest, LogsResponse, PipelineSnapshotResponse, ) -from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.time_utils import utc_now -from zenml.zen_stores.base_zen_store import BaseZenStore logger = get_logger(__name__) @@ -119,649 +99,62 @@ class LogEntry(BaseModel): ) -class ArtifactStoreHandler(logging.Handler): - """Handler that writes log messages to artifact store storage.""" - - def __init__(self, storage: "PipelineLogsStorage"): - """Initialize the handler with a storage instance. - - Args: - storage: The PipelineLogsStorage instance to write to. - """ - super().__init__() - self.storage = storage - - # Get storage log level from environment - self.setLevel(get_storage_log_level().value) - - def emit(self, record: logging.LogRecord) -> None: - """Emit a log record to the storage. - - Args: - record: The log record to emit. - """ - try: - # Get level enum - level = LoggingLevels.__members__.get(record.levelname.upper()) - - # Get the message - message = self.format(record) - message = remove_ansi_escape_codes(message).rstrip() - - # Check if message needs to be chunked - message_bytes = message.encode("utf-8") - if len(message_bytes) <= DEFAULT_MESSAGE_SIZE: - # Message is small enough, emit as-is - log_record = LogEntry.model_construct( - message=message, - name=record.name, - level=level, - timestamp=utc_now(tz_aware=True), - module=record.module, - filename=record.filename, - lineno=record.lineno, - ) - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - else: - # Message is too large, split into chunks and emit each one - chunks = self._split_to_chunks(message) - entry_id = uuid4() - for i, chunk in enumerate(chunks): - log_record = LogEntry.model_construct( - message=chunk, - name=record.name, - level=level, - module=record.module, - filename=record.filename, - lineno=record.lineno, - timestamp=utc_now(tz_aware=True), - chunk_index=i, - total_chunks=len(chunks), - id=entry_id, - ) - - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - except Exception: - pass - - def _split_to_chunks(self, message: str) -> List[str]: - """Split a large message into chunks. - - Args: - message: The message to split. - - Returns: - A list of message chunks. 
- """ - # Calculate how many chunks we need - message_bytes = message.encode("utf-8") - - # Split the message into chunks, handling UTF-8 boundaries - chunks = [] - start = 0 - - while start < len(message_bytes): - # Calculate the end position for this chunk - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - - # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue - while end > start: - chunk_bytes = message_bytes[start:end] - try: - chunk_text = chunk_bytes.decode("utf-8") - chunks.append(chunk_text) - break - except UnicodeDecodeError: - # If we can't decode, try a smaller chunk - end -= 1 - else: - # If we can't decode anything, use replacement characters - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - chunks.append( - message_bytes[start:end].decode("utf-8", errors="replace") - ) - - start = end - - return chunks - - -def remove_ansi_escape_codes(text: str) -> str: - """Auxiliary function to remove ANSI escape codes from a given string. - - Args: - text: the input string - - Returns: - the version of the input string where the escape codes are removed. - """ - return ansi_escape.sub("", text) - - -def parse_log_entry(log_line: str) -> Optional[LogEntry]: - """Parse a single log entry into a LogEntry object. - - Handles two formats: - 1. JSON format: {"timestamp": "...", "level": "...", "message": "...", "location": "..."} - Uses Pydantic's model_validate_json for automatic parsing and validation. - 2. Plain text: Any other text (defaults to INFO level) - - Args: - log_line: A single log line to parse - - Returns: - LogEntry object. For JSON logs, all fields are validated and parsed automatically. - For plain text logs, only message is populated with INFO level default. - Returns None only for empty lines. - """ - line = log_line.strip() - if not line: - return None - - if line.startswith("{") and line.endswith("}"): - try: - return LogEntry.model_validate_json(line) - except Exception: - pass - - old_format = re.search( - r"^\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+UTC\]", line - ) - - timestamp = None - if old_format: - timestamp = old_format.group(1) + "Z" - line = line.replace(old_format.group(0), "").strip() - - return LogEntry( - message=line, - name=None, - level=LoggingLevels.INFO, - timestamp=timestamp, - ) - - -def prepare_logs_uri( - artifact_store: "BaseArtifactStore", - step_name: Optional[str] = None, - log_key: Optional[str] = None, -) -> str: - """Generates and prepares a URI for the log file or folder for a step. - - Args: - artifact_store: The artifact store on which the artifact will be stored. - step_name: Name of the step. Skipped for global pipeline run logs. - log_key: The unique identification key of the log file. - - Returns: - The URI of the log storage (file or folder). - """ - if log_key is None: - log_key = str(uuid4()) - - subfolder = step_name or PIPELINE_RUN_LOGS_FOLDER - logs_base_uri = os.path.join(artifact_store.path, subfolder, "logs") - - # Create the dir - if not artifact_store.exists(logs_base_uri): - artifact_store.makedirs(logs_base_uri) - - # Delete the file if it already exists - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - logs_uri = os.path.join(logs_base_uri, log_key) - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs directory {logs_uri} already exists! Removing old log directory..." 
- ) - artifact_store.rmtree(logs_uri) - - artifact_store.makedirs(logs_uri) - else: - logs_uri = os.path.join(logs_base_uri, f"{log_key}{LOGS_EXTENSION}") - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs file {logs_uri} already exists! Removing old log file..." - ) - artifact_store.remove(logs_uri) - - return sanitize_remote_path(logs_uri) - - -def fetch_log_records( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], - logs_uri: str, -) -> List[LogEntry]: - """Fetches log entries. - - Args: - zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. - logs_uri: The URI of the artifact (file or directory). - - Returns: - List of log entries. - """ - log_entries = [] - - for line in _stream_logs_line_by_line( - zen_store, artifact_store_id, logs_uri - ): - if log_entry := parse_log_entry(line): - log_entries.append(log_entry) - - if len(log_entries) >= MAX_ENTRIES_PER_REQUEST: - break - - return log_entries - - -def _stream_logs_line_by_line( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], - logs_uri: str, -) -> Iterator[str]: - """Stream logs line by line without loading the entire file into memory. - - This generator yields log lines one by one, handling both single files - and directories with multiple log files. - - Args: - zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. - logs_uri: The URI of the log file or directory. - - Yields: - Individual log lines as strings. - - Raises: - DoesNotExistException: If the artifact does not exist in the artifact store. - """ - artifact_store = _load_artifact_store(artifact_store_id, zen_store) - - try: - if not artifact_store.isdir(logs_uri): - # Single file case - with artifact_store.open(logs_uri, "r") as file: - for line in file: - yield line.rstrip("\n\r") - else: - # Directory case - may contain multiple log files - files = artifact_store.listdir(logs_uri) - if not files: - raise DoesNotExistException( - f"Folder '{logs_uri}' is empty in artifact store " - f"'{artifact_store.name}'." - ) - - # Sort files to read them in order - files.sort() - - for file in files: - file_path = os.path.join(logs_uri, str(file)) - with artifact_store.open(file_path, "r") as f: - for line in f: - yield line.rstrip("\n\r") - finally: - artifact_store.cleanup() - - -class PipelineLogsStorage: - """Helper class which buffers and stores logs to a given URI using a background thread.""" - - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - max_queue_size: int = LOGS_STORAGE_MAX_QUEUE_SIZE, - queue_timeout: int = LOGS_STORAGE_QUEUE_TIMEOUT, - write_interval: int = LOGS_WRITE_INTERVAL_SECONDS, - merge_files_interval: int = LOGS_MERGE_INTERVAL_SECONDS, - ) -> None: - """Initialization. - - Args: - logs_uri: the URI of the log file or folder. - artifact_store: Artifact Store from the current step context - max_queue_size: maximum number of individual messages to queue. - queue_timeout: timeout in seconds for putting items in queue when full. - - Positive value: Wait N seconds, then drop logs if queue still full - - Negative value: Block indefinitely until queue has space (never drop logs) - write_interval: the amount of seconds before the created files - get written to the artifact store. - merge_files_interval: the amount of seconds before the created files - get merged into a single file. 
- """ - # Parameters - self.logs_uri = logs_uri - self.max_queue_size = max_queue_size - self.queue_timeout = queue_timeout - self.write_interval = write_interval - self.merge_files_interval = merge_files_interval - - # State - self.artifact_store = artifact_store - - # Immutable filesystems state - self.last_merge_time = time.time() - - # Queue and log storage thread for async processing - self.log_queue: queue.Queue[str] = queue.Queue(maxsize=max_queue_size) - self.log_storage_thread: Optional[threading.Thread] = None - self.shutdown_event = threading.Event() - self.merge_event = threading.Event() - - # Start the log storage thread - self._start_log_storage_thread() - - def _start_log_storage_thread(self) -> None: - """Start the log storage thread for processing log queue.""" - if ( - self.log_storage_thread is None - or not self.log_storage_thread.is_alive() - ): - self.log_storage_thread = threading.Thread( - target=self._log_storage_worker, - name="LogsStorage-Worker", - ) - self.log_storage_thread.start() - - def _process_log_queue(self, force_merge: bool = False) -> None: - """Write and merge logs to the artifact store using time-based batching. - - Args: - force_merge: Whether to force merge the logs. - """ - try: - messages = [] - - # Get first message (blocking with timeout) - try: - first_message = self.log_queue.get(timeout=1) - messages.append(first_message) - except queue.Empty: - return - - # Get any remaining messages without waiting (drain quickly) - while True: - try: - additional_message = self.log_queue.get_nowait() - messages.append(additional_message) - except queue.Empty: - break - - # Write the messages to the artifact store - if messages: - self.write_buffer(messages) - - # Merge the log files if needed - if ( - self._is_merge_needed - or self.merge_event.is_set() - or force_merge - ): - self.merge_event.clear() - - self.merge_log_files(merge_all_files=force_merge) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - finally: - for _ in messages: - self.log_queue.task_done() - - # Wait for the next write interval or until shutdown is requested - self.shutdown_event.wait(timeout=self.write_interval) - - def _log_storage_worker(self) -> None: - """Log storage thread worker that processes the log queue.""" - # Process the log queue until shutdown is requested - while not self.shutdown_event.is_set(): - self._process_log_queue() - - # Shutdown requested - drain remaining queue items and merge log files - self._process_log_queue(force_merge=True) - - def _shutdown_log_storage_thread(self, timeout: int = 5) -> None: - """Shutdown the log storage thread gracefully. - - Args: - timeout: Maximum time to wait for thread shutdown. - """ - if self.log_storage_thread and self.log_storage_thread.is_alive(): - # Then signal the worker to begin graceful shutdown - self.shutdown_event.set() - - # Wait for thread to finish (it will drain the queue automatically) - self.log_storage_thread.join(timeout=timeout) - - def write(self, text: str) -> None: - """Main write method that sends individual messages directly to queue. - - Args: - text: the incoming string. 
- """ - # Skip empty lines - if text == "\n": - return - - # If the current thread is the log storage thread, do nothing - # to prevent recursion when the storage thread itself generates logs - if ( - self.log_storage_thread - and threading.current_thread() == self.log_storage_thread - ): - return - - # If the current thread is the fsspec IO thread, do nothing - if self._is_fsspec_io_thread: - return - - try: - # Send individual message directly to queue - if not self.shutdown_event.is_set(): - try: - if self.queue_timeout < 0: - # Negative timeout = block indefinitely until queue has space - # Guarantees no log loss but may hang application - self.log_queue.put(text) - else: - # Positive timeout = wait specified time then drop logs - # Prevents application hanging but may lose logs - self.log_queue.put(text, timeout=self.queue_timeout) - except queue.Full: - # This only happens with positive timeout - # Queue is full - just skip this message to avoid blocking - # Better to drop logs than hang the application - pass - - except Exception: - # Silently ignore errors to prevent recursion - pass - - @property - def _is_merge_needed(self) -> bool: - """Checks whether the log files need to be merged. - - Returns: - whether the log files need to be merged. - """ - return ( - self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM - and time.time() - self.last_merge_time > self.merge_files_interval - ) - - @property - def _is_fsspec_io_thread(self) -> bool: - """Checks if the current thread is the fsspec IO thread. - - Returns: - whether the current thread is the fsspec IO thread. - """ - # Most artifact stores are based on fsspec, which converts between - # sync and async operations by using a separate AIO thread. - # It may happen that the fsspec call itself will log something, - # which will trigger this method, which may then use fsspec again, - # causing a "Calling sync() from within a running loop" error, because - # the fsspec library does not expect sync calls being made as a result - # of a logging call made by itself. - # To avoid this, we simply check if we're running in the fsspec AIO - # thread and skip the save if that's the case. - try: - return ( - asyncio.events.get_running_loop() is not None - and threading.current_thread().name == "fsspecIO" - ) - except RuntimeError: - # No running loop - return False - - def _get_timestamped_filename(self, suffix: str = "") -> str: - """Returns a timestamped filename. - - Args: - suffix: optional suffix for the file name - - Returns: - The timestamped filename. - """ - return f"{time.time()}{suffix}{LOGS_EXTENSION}" - - def write_buffer(self, buffer_to_write: List[str]) -> None: - """Write the given buffer to file. This runs in the log storage thread. - - Args: - buffer_to_write: The buffer contents to write to file. 
- """ - if not buffer_to_write: - return - - try: - # If the artifact store is immutable, write the buffer to a new file - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - _logs_uri = self._get_timestamped_filename() - with self.artifact_store.open( - os.path.join( - self.logs_uri, - _logs_uri, - ), - "w", - ) as file: - for message in buffer_to_write: - file.write(f"{message}\n") - - # If the artifact store is mutable, append the buffer to the existing file - else: - with self.artifact_store.open(self.logs_uri, "a") as file: - for message in buffer_to_write: - file.write(f"{message}\n") - self.artifact_store._remove_previous_file_versions( - self.logs_uri - ) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - - def merge_log_files(self, merge_all_files: bool = False) -> None: - """Merges all log files into one in the given URI. - - Called on the logging context exit. - - Args: - merge_all_files: whether to merge all files or only raw files - """ - from zenml.artifacts.utils import ( - _load_file_from_artifact_store, - ) - - # If the artifact store is immutable, merge the log files - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - merged_file_suffix = "_merged" - files_ = self.artifact_store.listdir(self.logs_uri) - if not merge_all_files: - # already merged files will not be merged again - files_ = [ - f for f in files_ if merged_file_suffix not in str(f) - ] - file_name_ = self._get_timestamped_filename( - suffix=merged_file_suffix - ) - if len(files_) > 1: - files_.sort() - logger.debug("Log files count: %s", len(files_)) - - missing_files = set() - # dump all logs to a local file first - with self.artifact_store.open( - os.path.join(self.logs_uri, file_name_), "w" - ) as merged_file: - for file in files_: - try: - merged_file.write( - str( - _load_file_from_artifact_store( - os.path.join(self.logs_uri, str(file)), - artifact_store=self.artifact_store, - mode="r", - ) - ) - ) - except DoesNotExistException: - missing_files.add(file) - - # clean up left over files - for file in files_: - if file not in missing_files: - self.artifact_store.remove( - os.path.join(self.logs_uri, str(file)) - ) - - # Update the last merge time - self.last_merge_time = time.time() - - def send_merge_event(self) -> None: - """Send a merge event to the log storage thread.""" - self.merge_event.set() - - class LoggingContext: """Context manager which collects logs using a LogStore.""" - def __init__(self, source: str = "step") -> None: + def __init__(self, source: str) -> None: """Initialize the logging context. 
Args: source: An identifier for the source of the logs (e.g., "step", "orchestrator") """ - self.source = source - - self.log_request_id = uuid4() - + # Create the log store first if Client().active_stack.log_store: self.log_store = Client().active_stack.log_store else: - self.log_store = DefaultLogStore() + from zenml.log_stores import ( + DefaultLogStore, + DefaultLogStoreConfig, + DefaultLogStoreFlavor, + ) + + default_log_store_flavor = DefaultLogStoreFlavor() + + self.log_store = DefaultLogStore( + id=uuid4(), + name="temporary_default", + flavor=default_log_store_flavor.name, + type=default_log_store_flavor.type, + config=DefaultLogStoreConfig(), + environment={}, + user=Client().active_user.id, + created=utc_now(), + updated=utc_now(), + secrets=[], + ) - def create_log_request(self) -> "LogsRequest": + # Based on the source, generate the log request + self.source = source + self.log_request = self.generate_log_request() + + def generate_log_request(self) -> "LogsRequest": """Create a log request model. - In their structure, LogRequest objects do not feature an entity ID or type, they - are rather used within other request model that does the assignment automatically. - That's why everytime we start a context, we need to be able to create the - corresponding LogRequest object. + Returns: + The log request model. """ + from zenml.log_stores.default.default_log_store import DefaultLogStore + if isinstance(self.log_store, DefaultLogStore): return LogsRequest( - id=self.log_request_id, + id=uuid4(), source=self.source, uri=self.log_store.uri, artifact_store_id=self.log_store.artifact_store_id, ) else: return LogsRequest( - id=self.log_request_id, + id=uuid4(), source=self.source, log_store_id=self.log_store.id, ) @@ -772,7 +165,7 @@ def __enter__(self) -> "LoggingContext": Returns: self """ - self.log_store.activate(source=self.source) + self.log_store.activate(log_request=self.log_request) return self def __exit__( diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 6eaa957456b..9e5f367a45d 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -269,13 +269,8 @@ def launch(self) -> None: logs_model = None if step_logging_enabled: - from zenml.enums import LoggableEntityType - - logs_context = step_logging.LoggingContext( - source="step", - ) # type: ignore[assignment] - - logs_model = logs_context.create_log_request() + logs_context = step_logging.LoggingContext(source="step") + logs_model = logs_context.log_request with logs_context: if run_was_created: @@ -333,7 +328,8 @@ def launch(self) -> None: step_logging.LoggingContext, ): # For LoggingContext using DefaultLogStore, trigger merge - from zenml.log_stores.default_log_store import ( + # TODO: investigate + from zenml.log_stores.default.default_log_store import ( DefaultLogStore, ) diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index ef0790b9b03..9a662ff3e8c 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -151,9 +151,7 @@ def run( logs_context = nullcontext() if step_logging_enabled and not redirected.get(): if step_run.logs: - logs_context = LoggingContext( - source="step", - ) + logs_context = LoggingContext(source="step") else: logger.debug( "There is no LogsResponseModel prepared for the step. 
The" diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 4f4984f9f4d..a0d012747a1 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -961,8 +961,8 @@ def _run( logs_request = None if logging_enabled: - logs_context = LoggingContext() - logs_request = logs_context.create_log_request() + logs_context = LoggingContext("client") + logs_request = logs_context.log_request with logs_context: snapshot = self._create_snapshot(**self._run_args) diff --git a/src/zenml/stack/flavor_registry.py b/src/zenml/stack/flavor_registry.py index 4c1fd139269..ded89656b70 100644 --- a/src/zenml/stack/flavor_registry.py +++ b/src/zenml/stack/flavor_registry.py @@ -69,7 +69,11 @@ def builtin_flavors(self) -> List[Type[Flavor]]: ) from zenml.deployers import DockerDeployerFlavor from zenml.image_builders import LocalImageBuilderFlavor - from zenml.log_stores import DatadogLogStoreFlavor, OtelLogStoreFlavor + from zenml.log_stores import ( + DatadogLogStoreFlavor, + DefaultLogStoreFlavor, + OtelLogStoreFlavor, + ) from zenml.orchestrators import ( LocalDockerOrchestratorFlavor, LocalOrchestratorFlavor, @@ -86,6 +90,7 @@ def builtin_flavors(self) -> List[Type[Flavor]]: GitHubContainerRegistryFlavor, LocalImageBuilderFlavor, DockerDeployerFlavor, + DefaultLogStoreFlavor, OtelLogStoreFlavor, DatadogLogStoreFlavor, ] From 185d2796e3249dc3c0df5002af238cbc43cd62f8 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 2025 13:37:09 +0100 Subject: [PATCH 07/81] solving things --- .../log_stores/default/default_log_store.py | 65 ++----------------- src/zenml/logging/step_logging.py | 58 +++++++++++++++-- .../zen_server/routers/runs_endpoints.py | 5 +- .../zen_server/routers/steps_endpoints.py | 2 +- .../versions/5c0a1c787128_add_log_stores.py | 3 +- 5 files changed, 66 insertions(+), 67 deletions(-) diff --git a/src/zenml/log_stores/default/default_log_store.py b/src/zenml/log_stores/default/default_log_store.py index 5af6b3757ef..43279430082 100644 --- a/src/zenml/log_stores/default/default_log_store.py +++ b/src/zenml/log_stores/default/default_log_store.py @@ -61,7 +61,6 @@ LogsRequest, LogsResponse, ) -from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.time_utils import utc_now from zenml.zen_stores.base_zen_store import BaseZenStore @@ -93,44 +92,6 @@ def remove_ansi_escape_codes(text: str) -> str: return ansi_escape.sub("", text) -def prepare_logs_uri( - artifact_store: "BaseArtifactStore", - log_id: UUID, -) -> str: - """Generates and prepares a URI for the log file or folder for a step. - - Args: - artifact_store: The artifact store on which the artifact will be stored. - log_id: The ID of the logs entity - - Returns: - The URI of the log storage (file or folder). - """ - logs_base_uri = os.path.join(artifact_store.path, "logs") - - if not artifact_store.exists(logs_base_uri): - artifact_store.makedirs(logs_base_uri) - - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - logs_uri = os.path.join(logs_base_uri, log_id) - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs directory {logs_uri} already exists! Removing old log directory..." - ) - artifact_store.rmtree(logs_uri) - - artifact_store.makedirs(logs_uri) - else: - logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs file {logs_uri} already exists! Removing old log file..." 
- ) - artifact_store.remove(logs_uri) - - return sanitize_remote_path(logs_uri) - - def fetch_log_records( zen_store: "BaseZenStore", artifact_store_id: Union[str, UUID], @@ -253,7 +214,7 @@ def parse_log_entry(log_line: str) -> Optional[LogEntry]: ) -class PipelineLogsStorage: +class LogsStorage: """Helper class which buffers and stores logs to a given URI using a background thread.""" def __init__( @@ -696,15 +657,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: *args: Positional arguments for the base class. **kwargs: Keyword arguments for the base class. """ - client = Client() - self._artifact_store = client.active_stack.artifact_store - - self.storage: Optional["PipelineLogsStorage"] = None + self.storage: Optional["LogsStorage"] = None self.handler: Optional["ArtifactStoreHandler"] = None - self.uri: Optional[str] = None - self.artifact_store_id: Optional[UUID] = None - self._original_root_level: Optional[int] = None @property @@ -716,24 +671,16 @@ def config(self) -> DefaultLogStoreConfig: """ return cast(DefaultLogStoreConfig, self._config) - def activate( - self, - log_request: "LogsRequest", - ) -> None: + def activate(self, log_request: "LogsRequest") -> None: """Activate log collection to the artifact store. Args: log_request: The log request model. """ - logs_uri = prepare_logs_uri( - log_id=log_request.id, - artifact_store=self._artifact_store, - ) - # Create storage and handler - self.storage = PipelineLogsStorage( - logs_uri=logs_uri, - artifact_store=self._artifact_store, + self.storage = LogsStorage( + logs_uri=log_request.uri, + artifact_store=Client().active_stack.artifact_store, ) self.handler = ArtifactStoreHandler(self.storage) diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 30046a2cdc2..92b3066def5 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -13,12 +13,14 @@ # permissions and limitations under the License. """ZenML logging handler.""" +import os import re from contextlib import nullcontext from contextvars import ContextVar from datetime import datetime from types import TracebackType from typing import ( + TYPE_CHECKING, Any, Optional, Type, @@ -41,8 +43,12 @@ LogsResponse, PipelineSnapshotResponse, ) +from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.time_utils import utc_now +if TYPE_CHECKING: + from zenml.artifact_stores import BaseArtifactStore + logger = get_logger(__name__) # Context variables @@ -59,6 +65,44 @@ DEFAULT_MESSAGE_SIZE = 5 * 1024 +def prepare_logs_uri( + artifact_store: "BaseArtifactStore", + log_id: UUID, +) -> str: + """Generates and prepares a URI for the log file or folder for a step. + + Args: + artifact_store: The artifact store on which the artifact will be stored. + log_id: The ID of the logs entity + + Returns: + The URI of the log storage (file or folder). + """ + logs_base_uri = os.path.join(artifact_store.path, "logs") + + if not artifact_store.exists(logs_base_uri): + artifact_store.makedirs(logs_base_uri) + + if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + logs_uri = os.path.join(logs_base_uri, log_id) + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs directory {logs_uri} already exists! Removing old log directory..." + ) + artifact_store.rmtree(logs_uri) + + artifact_store.makedirs(logs_uri) + else: + logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs file {logs_uri} already exists! Removing old log file..." 
+ ) + artifact_store.remove(logs_uri) + + return sanitize_remote_path(logs_uri) + + class LogEntry(BaseModel): """A structured log entry with parsed information.""" @@ -146,11 +190,17 @@ def generate_log_request(self) -> "LogsRequest": from zenml.log_stores.default.default_log_store import DefaultLogStore if isinstance(self.log_store, DefaultLogStore): + log_id = uuid4() + artifact_store = Client().active_stack.artifact_store + return LogsRequest( - id=uuid4(), + id=log_id, source=self.source, - uri=self.log_store.uri, - artifact_store_id=self.log_store.artifact_store_id, + uri=prepare_logs_uri( + artifact_store=artifact_store, + log_id=log_id, + ), + artifact_store_id=artifact_store.id, ) else: return LogsRequest( @@ -193,7 +243,6 @@ def __exit__( def setup_orchestrator_logging( run_id: UUID, snapshot: "PipelineSnapshotResponse", - logs_response: Optional[LogsResponse] = None, ) -> Any: """Set up logging for an orchestrator environment. @@ -208,6 +257,7 @@ def setup_orchestrator_logging( Returns: The logs context """ + # TODO: we need to establish the connection here again. try: logging_enabled = True diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index 7f50b2adc84..9b1b02559d6 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -35,7 +35,6 @@ from zenml.logging.step_logging import ( MAX_ENTRIES_PER_REQUEST, LogEntry, - parse_log_entry, ) from zenml.models import ( Page, @@ -469,6 +468,10 @@ def run_logs( if ( snapshot.template_id or snapshot.source_snapshot_id ) and server_config().workload_manager_enabled: + from zenml.log_stores.default.default_log_store import ( + parse_log_entry, + ) + workload_logs = workload_manager().get_logs( workload_id=snapshot.id ) diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 0014a8ed653..b7c2ef2abbc 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -21,7 +21,6 @@ from zenml.constants import ( API, LOGS, - MAX_ENTRIES_PER_REQUEST, STATUS, STEP_CONFIGURATION, STEPS, @@ -30,6 +29,7 @@ from zenml.enums import ExecutionStatus from zenml.log_stores import fetch_logs from zenml.logging.step_logging import ( + MAX_ENTRIES_PER_REQUEST, LogEntry, ) from zenml.models import ( diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index fcbbd89b0f7..b83e4052aea 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -11,9 +11,8 @@ from alembic import op # revision identifiers, used by Alembic. 
-# TODO: I WILL HAVE TO CHANGE THIS revision = "5c0a1c787128" -down_revision = "124b57b8c7b1" +down_revision = "0.91.0" branch_labels = None depends_on = None From f4a62fb9a5810fe28b70d3808efbba590e104eec Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 2025 14:34:09 +0100 Subject: [PATCH 08/81] some other checkpoint --- src/zenml/log_stores/__init__.py | 15 +++-- .../log_stores/datadog/datadog_flavor.py | 31 +++++++++- .../log_stores/datadog/datadog_log_store.py | 34 +---------- src/zenml/log_stores/otel/otel_flavor.py | 55 ++++++++++++++++- src/zenml/log_stores/otel/otel_log_store.py | 59 +------------------ src/zenml/logging/step_logging.py | 1 - 6 files changed, 94 insertions(+), 101 deletions(-) diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index 218d3b7c19d..ed602a92728 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -26,22 +26,27 @@ # Default log store from zenml.log_stores.default.default_log_store import ( DefaultLogStore, - DefaultLogStoreConfig, ) from zenml.log_stores.default.default_log_store_flavor import ( + DefaultLogStoreConfig, DefaultLogStoreFlavor, ) # OpenTelemetry log store -from zenml.log_stores.otel.otel_log_store import OtelLogStore, OtelLogStoreConfig -from zenml.log_stores.otel.otel_flavor import OtelLogStoreFlavor +from zenml.log_stores.otel.otel_log_store import OtelLogStore +from zenml.log_stores.otel.otel_flavor import ( + OtelLogStoreConfig, + OtelLogStoreFlavor, +) # Datadog log store from zenml.log_stores.datadog.datadog_log_store import ( - DatadogLogStore, + DatadogLogStore, +) +from zenml.log_stores.datadog.datadog_flavor import ( DatadogLogStoreConfig, + DatadogLogStoreFlavor, ) -from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreFlavor # Utils from zenml.log_stores.utils import fetch_logs diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 082907ecc65..ae5c0f2cfa6 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -13,14 +13,39 @@ # permissions and limitations under the License. """Datadog log store flavor.""" -from typing import TYPE_CHECKING, Type +from typing import TYPE_CHECKING, Dict, Type + +from pydantic import Field, SecretStr from zenml.enums import StackComponentType from zenml.log_stores import BaseLogStore, BaseLogStoreConfig +from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.stack.flavor import Flavor -if TYPE_CHECKING: - pass + + +class DatadogLogStoreConfig(OtelLogStoreConfig): + """Configuration for Datadog log store. + + This extends OtelLogStoreConfig with Datadog-specific settings. + + Attributes: + api_key: Datadog API key for log ingestion. + site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). + additional_tags: Additional tags to add to all logs. 
+ """ + + api_key: SecretStr = Field( + description="Datadog API key for log ingestion", + ) + site: str = Field( + default="datadoghq.com", + description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", + ) + additional_tags: Dict[str, str] = Field( + default_factory=dict, + description="Additional tags to add to all logs", + ) class DatadogLogStoreFlavor(Flavor): diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index a64e5ff1d29..2c7b5c604e9 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -16,12 +16,7 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast -from pydantic import Field, SecretStr - -from zenml.log_stores.otel.otel_log_store import ( - OtelLogStore, - OtelLogStoreConfig, -) +from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger if TYPE_CHECKING: @@ -33,30 +28,6 @@ logger = get_logger(__name__) -class DatadogLogStoreConfig(OtelLogStoreConfig): - """Configuration for Datadog log store. - - This extends OtelLogStoreConfig with Datadog-specific settings. - - Attributes: - api_key: Datadog API key for log ingestion. - site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). - additional_tags: Additional tags to add to all logs. - """ - - api_key: SecretStr = Field( - description="Datadog API key for log ingestion", - ) - site: str = Field( - default="datadoghq.com", - description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", - ) - additional_tags: Dict[str, str] = Field( - default_factory=dict, - description="Additional tags to add to all logs", - ) - - class DatadogLogExporter: """Custom log exporter that sends logs to Datadog's HTTP intake API. @@ -77,8 +48,7 @@ def __init__( site: Datadog site domain. additional_tags: Additional tags to add to all logs. """ - self.api_key = api_key - self.endpoint = f"https://http-intake.logs.{site}/v1/input" + self.headers = { "DD-API-KEY": api_key, "Content-Type": "application/json", diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index ecb4a972ead..49cff18a246 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -13,13 +13,63 @@ # permissions and limitations under the License. """OpenTelemetry log store flavor.""" -from typing import Type +from typing import Dict, Optional, Type + +from pydantic import Field from zenml.enums import StackComponentType from zenml.log_stores import BaseLogStore, BaseLogStoreConfig from zenml.stack.flavor import Flavor +class OtelLogStoreConfig(BaseLogStoreConfig): + """Configuration for OpenTelemetry log store. + + Attributes: + service_name: Name of the service (defaults to "zenml"). + service_version: Version of the service. + deployment_environment: Deployment environment (e.g., "production"). + max_queue_size: Maximum queue size for batch processor. + schedule_delay_millis: Delay between batch exports in milliseconds. + max_export_batch_size: Maximum batch size for exports. + endpoint: Optional OTLP endpoint URL (for HTTP/gRPC exporters). + headers: Optional headers for OTLP exporter. + insecure: Whether to use insecure connection for OTLP. 
+ """ + service_name: str = Field( + default="zenml", + description="Name of the service for telemetry", + ) + service_version: str = Field( + default="1.0.0", + description="Version of the service", + ) + max_queue_size: int = Field( + default=2048, + description="Maximum queue size for batch log processor", + ) + schedule_delay_millis: int = Field( + default=1000, + description="Export interval in milliseconds", + ) + max_export_batch_size: int = Field( + default=512, + description="Maximum batch size for exports", + ) + endpoint: Optional[str] = Field( + default=None, + description="OTLP endpoint URL", + ) + headers: Dict[str, str] = Field( + default_factory=dict, + description="Headers for OTLP exporter", + ) + insecure: bool = Field( + default=False, + description="Whether to use insecure connection", + ) + + class OtelLogStoreFlavor(Flavor): """OpenTelemetry log store flavor.""" @@ -57,6 +107,7 @@ def logo_url(self) -> str: Returns: The URL to the flavor logo. """ + # TODO: Add a logo for the OpenTelemetry log store return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/otel.png" @property @@ -75,8 +126,6 @@ def config_class(self) -> Type[BaseLogStoreConfig]: Returns: The config class. """ - from zenml.log_stores.otel.otel_log_store import OtelLogStoreConfig - return OtelLogStoreConfig @property diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index ec9ac0a9955..32a9de53b9a 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -16,12 +16,10 @@ import logging from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, List, Optional, cast from uuid import UUID -from pydantic import Field - -from zenml.log_stores.base_log_store import BaseLogStore, BaseLogStoreConfig +from zenml.log_stores.base_log_store import BaseLogStore from zenml.logger import get_logger, get_storage_log_level, logging_handlers from zenml.utils.string_utils import random_str @@ -35,59 +33,6 @@ logger = get_logger(__name__) -class OtelLogStoreConfig(BaseLogStoreConfig): - """Configuration for OpenTelemetry log store. - - Attributes: - service_name: Name of the service (defaults to "zenml"). - service_version: Version of the service. - deployment_environment: Deployment environment (e.g., "production"). - max_queue_size: Maximum queue size for batch processor. - schedule_delay_millis: Delay between batch exports in milliseconds. - max_export_batch_size: Maximum batch size for exports. - endpoint: Optional OTLP endpoint URL (for HTTP/gRPC exporters). - headers: Optional headers for OTLP exporter. - insecure: Whether to use insecure connection for OTLP. 
- """ - - service_name: str = Field( - default="zenml", - description="Name of the service for telemetry", - ) - service_version: str = Field( - default="1.0.0", - description="Version of the service", - ) - deployment_environment: str = Field( - default="production", - description="Deployment environment", - ) - max_queue_size: int = Field( - default=2048, - description="Maximum queue size for batch log processor", - ) - schedule_delay_millis: int = Field( - default=1000, - description="Export interval in milliseconds", - ) - max_export_batch_size: int = Field( - default=512, - description="Maximum batch size for exports", - ) - endpoint: Optional[str] = Field( - default=None, - description="OTLP endpoint URL", - ) - headers: Dict[str, str] = Field( - default_factory=dict, - description="Headers for OTLP exporter", - ) - insecure: bool = Field( - default=False, - description="Whether to use insecure connection", - ) - - class OtelLogStore(BaseLogStore): """Log store that exports logs using OpenTelemetry. diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 92b3066def5..c43a4cb2983 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -40,7 +40,6 @@ ) from zenml.models import ( LogsRequest, - LogsResponse, PipelineSnapshotResponse, ) from zenml.utils.io_utils import sanitize_remote_path From c189af8c7178d2153a3c32b95e70646dae021ac1 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 2025 16:31:42 +0100 Subject: [PATCH 09/81] formatting --- src/zenml/log_stores/datadog/datadog_flavor.py | 3 +-- src/zenml/log_stores/otel/otel_flavor.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index ae5c0f2cfa6..5d0d8bc7885 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. """Datadog log store flavor.""" -from typing import TYPE_CHECKING, Dict, Type +from typing import Dict, Type from pydantic import Field, SecretStr @@ -23,7 +23,6 @@ from zenml.stack.flavor import Flavor - class DatadogLogStoreConfig(OtelLogStoreConfig): """Configuration for Datadog log store. diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index 49cff18a246..aac9a01570e 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -36,6 +36,7 @@ class OtelLogStoreConfig(BaseLogStoreConfig): headers: Optional headers for OTLP exporter. insecure: Whether to use insecure connection for OTLP. 
""" + service_name: str = Field( default="zenml", description="Name of the service for telemetry", From 929bd3aa69913fb2a21006514677cb94adf474c7 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 2025 16:56:27 +0100 Subject: [PATCH 10/81] new changes --- src/zenml/log_stores/base_log_store.py | 1 + .../log_stores/datadog/datadog_flavor.py | 4 - .../log_stores/datadog/datadog_log_store.py | 4 +- src/zenml/log_stores/otel/otel_log_store.py | 84 +++++-------------- src/zenml/stack/flavor_registry.py | 2 - 5 files changed, 25 insertions(+), 70 deletions(-) diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index c906b118cf7..e4e655a5ec5 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -46,6 +46,7 @@ def config(self) -> BaseLogStoreConfig: """ return cast(BaseLogStoreConfig, self._config) + # TODO: This should probably accept not just requests but also responses @abstractmethod def activate(self, log_request: "LogsRequest") -> None: """Activate the log store for log collection. diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 5d0d8bc7885..46530599900 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -102,10 +102,6 @@ def config_class(self) -> Type[BaseLogStoreConfig]: Returns: The config class. """ - from zenml.log_stores.datadog.datadog_log_store import ( - DatadogLogStoreConfig, - ) - return DatadogLogStoreConfig @property diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 2c7b5c604e9..9f0fe697bf6 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -16,6 +16,7 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger @@ -162,8 +163,7 @@ class DatadogLogStore(OtelLogStore): """Log store that exports logs to Datadog. This implementation extends OtelLogStore and configures it to send logs - to Datadog's HTTP intake API. Logs are sent with appropriate tags including - pipeline_run_id, step_id, and source for easy filtering on Datadog. + to Datadog's HTTP intake API. """ @property diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 32a9de53b9a..8b7d4880719 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -19,9 +19,14 @@ from typing import TYPE_CHECKING, Any, List, Optional, cast from uuid import UUID +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.resources import Resource + from zenml.log_stores.base_log_store import BaseLogStore +from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.logger import get_logger, get_storage_log_level, logging_handlers -from zenml.utils.string_utils import random_str +from zenml.models import LogsRequest if TYPE_CHECKING: from opentelemetry.sdk._logs import LoggerProvider @@ -79,53 +84,25 @@ def get_exporter(self) -> "LogExporter": The log exporter instance. 
""" - def activate( - self, - pipeline_run_id: UUID, - step_id: Optional[UUID] = None, - source: str = "step", - ) -> None: + def activate(self, log_request: "LogsRequest") -> None: """Activate log collection with OpenTelemetry. Args: - pipeline_run_id: The ID of the pipeline run. - step_id: The ID of the step (if collecting step logs). - source: The source of the logs (e.g., "step", "orchestrator"). + log_request: The log request model. """ - try: - from opentelemetry.sdk._logs import LoggerProvider - from opentelemetry.sdk._logs.export import BatchLogRecordProcessor - from opentelemetry.sdk.resources import Resource - except ImportError: - logger.error( - "OpenTelemetry SDK not installed. Install with: " - "pip install opentelemetry-sdk opentelemetry-exporter-otlp" - ) - return - - # Store metadata - self._pipeline_run_id = pipeline_run_id - self._step_id = step_id - self._source = source - - # Create resource with service information and ZenML metadata - resource_attributes = { - "service.name": self.config.service_name, - "service.version": self.config.service_version, - "service.instance.id": random_str(8), - "deployment.environment": self.config.deployment_environment, - "zenml.pipeline_run_id": str(pipeline_run_id), - "zenml.source": source, - } - if step_id: - resource_attributes["zenml.step_id"] = str(step_id) - - otel_resource = Resource.create(resource_attributes) + # Create resource + otel_resource = Resource.create( + { + "service.name": self.config.service_name, + "service.version": self.config.service_version, + "zenml.log_id": str(log_request.id), + } + ) # Create logger provider self.logger_provider = LoggerProvider(resource=otel_resource) - # Get exporter from subclass + # Get exporter exporter = self.get_exporter() # Create batch processor for efficient background processing @@ -138,18 +115,10 @@ def activate( self.logger_provider.add_log_record_processor(processor) # Create handler for Python logging integration - try: - from opentelemetry.sdk._logs import LoggingHandler - - self.handler = LoggingHandler( - level=get_storage_log_level().value, - logger_provider=self.logger_provider, - ) - except ImportError: - logger.error( - "Failed to import LoggingHandler from OpenTelemetry SDK" - ) - return + self.handler = LoggingHandler( + level=get_storage_log_level().value, + logger_provider=self.logger_provider, + ) # Add handler to root logger root_logger = logging.getLogger() @@ -165,11 +134,6 @@ def activate( # Add to context variables for print capture logging_handlers.add(self.handler) - logger.debug( - f"OtelLogStore activated for {source} " - f"(pipeline_run={pipeline_run_id}, step={step_id})" - ) - def deactivate(self) -> None: """Deactivate log collection and flush remaining logs.""" if not self.handler: @@ -199,6 +163,7 @@ def deactivate(self) -> None: logger.debug("OtelLogStore deactivated") + @abstractmethod def fetch( self, logs_model: "LogsResponse", @@ -221,8 +186,3 @@ def fetch( Returns: List of log entries from the backend. """ - logger.warning( - "OtelLogStore.fetch() not implemented. " - "Subclasses should override this method to query their backend." 
- ) - return [] diff --git a/src/zenml/stack/flavor_registry.py b/src/zenml/stack/flavor_registry.py index 4624ec6c369..7244e1263d5 100644 --- a/src/zenml/stack/flavor_registry.py +++ b/src/zenml/stack/flavor_registry.py @@ -72,7 +72,6 @@ def builtin_flavors(self) -> List[Type[Flavor]]: from zenml.log_stores import ( DatadogLogStoreFlavor, DefaultLogStoreFlavor, - OtelLogStoreFlavor, ) from zenml.orchestrators import ( LocalDockerOrchestratorFlavor, @@ -91,7 +90,6 @@ def builtin_flavors(self) -> List[Type[Flavor]]: LocalImageBuilderFlavor, DockerDeployerFlavor, DefaultLogStoreFlavor, - OtelLogStoreFlavor, DatadogLogStoreFlavor, LocalDeployerFlavor, ] From ee9973a4a53159661670052e535761cabe4cbab3 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 30 Oct 2025 17:21:31 +0100 Subject: [PATCH 11/81] checkpoint --- src/zenml/cli/stack.py | 11 ++++ .../log_stores/datadog/datadog_flavor.py | 7 +-- .../log_stores/datadog/datadog_log_store.py | 57 ++++--------------- src/zenml/log_stores/otel/otel_flavor.py | 18 +----- src/zenml/log_stores/otel/otel_log_store.py | 2 +- 5 files changed, 26 insertions(+), 69 deletions(-) diff --git a/src/zenml/cli/stack.py b/src/zenml/cli/stack.py index 6e82a0a9d32..cb336908973 100644 --- a/src/zenml/cli/stack.py +++ b/src/zenml/cli/stack.py @@ -203,6 +203,14 @@ def stack() -> None: type=str, required=False, ) +@click.option( + "-l", + "--log_store", + "log_store", + help="Name of the log store for this stack.", + type=str, + required=False, +) @click.option( "--set", "set_stack", @@ -256,6 +264,7 @@ def register_stack( data_validator: Optional[str] = None, image_builder: Optional[str] = None, deployer: Optional[str] = None, + log_store: Optional[str] = None, set_stack: bool = False, provider: Optional[str] = None, connector: Optional[str] = None, @@ -279,6 +288,7 @@ def register_stack( data_validator: Name of the data validator for this stack. image_builder: Name of the new image builder for this stack. deployer: Name of the deployer for this stack. + log_store: Name of the log store for this stack. set_stack: Immediately set this stack as active. provider: Name of the cloud provider for this stack. connector: Name of the service connector for this stack. @@ -523,6 +533,7 @@ def register_stack( (StackComponentType.DATA_VALIDATOR, data_validator), (StackComponentType.FEATURE_STORE, feature_store), (StackComponentType.IMAGE_BUILDER, image_builder), + (StackComponentType.LOG_STORE, log_store), (StackComponentType.MODEL_DEPLOYER, model_deployer), (StackComponentType.MODEL_REGISTRY, model_registry), (StackComponentType.STEP_OPERATOR, step_operator), diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 46530599900..be6e9edc9f4 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. """Datadog log store flavor.""" -from typing import Dict, Type +from typing import Type from pydantic import Field, SecretStr @@ -31,7 +31,6 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): Attributes: api_key: Datadog API key for log ingestion. site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). - additional_tags: Additional tags to add to all logs. 
""" api_key: SecretStr = Field( @@ -41,10 +40,6 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): default="datadoghq.com", description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", ) - additional_tags: Dict[str, str] = Field( - default_factory=dict, - description="Additional tags to add to all logs", - ) class DatadogLogStoreFlavor(Flavor): diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 9f0fe697bf6..4571d8892ba 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -14,22 +14,23 @@ """Datadog log store implementation.""" from datetime import datetime -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import Any, Dict, List, Optional, cast +import requests +from opentelemetry.sdk._logs import LogData +from opentelemetry.sdk._logs.export import LogExporter, LogExportResult + +from zenml.enums import LoggingLevels from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger - -if TYPE_CHECKING: - from opentelemetry.sdk._logs.export import LogExporter - - from zenml.logging.step_logging import LogEntry - from zenml.models import LogsResponse +from zenml.logging.step_logging import LogEntry +from zenml.models import LogsResponse logger = get_logger(__name__) -class DatadogLogExporter: +class DatadogLogExporter(LogExporter): """Custom log exporter that sends logs to Datadog's HTTP intake API. This exporter transforms OpenTelemetry log records into Datadog's format @@ -40,23 +41,20 @@ def __init__( self, api_key: str, site: str = "datadoghq.com", - additional_tags: Optional[Dict[str, str]] = None, ): """Initialize the Datadog log exporter. Args: api_key: Datadog API key. site: Datadog site domain. - additional_tags: Additional tags to add to all logs. """ - + self.endpoint = f"https://http-intake.logs.{site}/v1/input" self.headers = { "DD-API-KEY": api_key, "Content-Type": "application/json", } - self.additional_tags = additional_tags or {} - def export(self, batch: List[Any]) -> Any: + def export(self, batch: List[LogData]) -> Any: """Export a batch of log records to Datadog. Args: @@ -65,21 +63,6 @@ def export(self, batch: List[Any]) -> Any: Returns: LogExportResult indicating success or failure. """ - try: - import requests - from opentelemetry.sdk._logs.export import LogExportResult - except ImportError: - logger.error( - "Required packages not installed. 
Install with: " - "pip install requests opentelemetry-sdk" - ) - from opentelemetry.sdk._logs.export import LogExportResult - - return LogExportResult.FAILURE - - if not batch: - return LogExportResult.SUCCESS - logs = [] for log_data in batch: log_record = log_data.log_record @@ -95,16 +78,11 @@ def export(self, batch: List[Any]) -> Any: log_attrs = dict(log_record.attributes) # Combine attributes with additional tags - all_attrs = {**resource_attrs, **log_attrs, **self.additional_tags} + all_attrs = {**resource_attrs, **log_attrs} # Build Datadog log entry log_entry = { "message": str(log_record.body), - "ddsource": "zenml", - "service": resource_attrs.get("service.name", "zenml"), - "hostname": resource_attrs.get( - "service.instance.id", "unknown" - ), } # Add severity if available @@ -184,7 +162,6 @@ def get_exporter(self) -> "LogExporter": return DatadogLogExporter( api_key=self.config.api_key.get_secret_value(), site=self.config.site, - additional_tags=self.config.additional_tags, ) def fetch( @@ -209,16 +186,6 @@ def fetch( Returns: List of log entries from Datadog. """ - try: - import requests - except ImportError: - logger.error( - "requests package not installed. Install with: pip install requests" - ) - return [] - - from zenml.logging.step_logging import LogEntry - # Build query query_parts = [ f"service:{self.config.service_name}", diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index aac9a01570e..75887cbcf36 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. """OpenTelemetry log store flavor.""" -from typing import Dict, Optional, Type +from typing import Type from pydantic import Field @@ -41,10 +41,6 @@ class OtelLogStoreConfig(BaseLogStoreConfig): default="zenml", description="Name of the service for telemetry", ) - service_version: str = Field( - default="1.0.0", - description="Version of the service", - ) max_queue_size: int = Field( default=2048, description="Maximum queue size for batch log processor", @@ -57,18 +53,6 @@ class OtelLogStoreConfig(BaseLogStoreConfig): default=512, description="Maximum batch size for exports", ) - endpoint: Optional[str] = Field( - default=None, - description="OTLP endpoint URL", - ) - headers: Dict[str, str] = Field( - default_factory=dict, - description="Headers for OTLP exporter", - ) - insecure: bool = Field( - default=False, - description="Whether to use insecure connection", - ) class OtelLogStoreFlavor(Flavor): diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 8b7d4880719..17a8be3f456 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -94,7 +94,7 @@ def activate(self, log_request: "LogsRequest") -> None: otel_resource = Resource.create( { "service.name": self.config.service_name, - "service.version": self.config.service_version, + "service.version": "0.91.0", # TODO: Fetch this "zenml.log_id": str(log_request.id), } ) From 1f95b4fc50ccb37541e437bf75dbe5a6ec5e28b3 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 31 Oct 2025 10:34:09 +0100 Subject: [PATCH 12/81] fixing the secret --- src/zenml/log_stores/datadog/datadog_flavor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index be6e9edc9f4..8a5748cc884 100644 --- 
a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -15,12 +15,13 @@ from typing import Type -from pydantic import Field, SecretStr +from pydantic import Field from zenml.enums import StackComponentType from zenml.log_stores import BaseLogStore, BaseLogStoreConfig from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.stack.flavor import Flavor +from zenml.utils.secret_utils import PlainSerializedSecretStr, SecretField class DatadogLogStoreConfig(OtelLogStoreConfig): @@ -33,7 +34,7 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). """ - api_key: SecretStr = Field( + api_key: PlainSerializedSecretStr = Field( description="Datadog API key for log ingestion", ) site: str = Field( From 4127bbda3061b6e6cc08e099fb76f71c22939305 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 6 Nov 2025 17:17:29 +0100 Subject: [PATCH 13/81] some broken checkpoint --- .../book/component-guide/log-stores/custom.md | 218 +++++++--- src/zenml/log_stores/base_log_store.py | 22 +- .../log_stores/datadog/datadog_flavor.py | 2 +- .../default/artifact_store_exporter.py | 202 ++++++++++ .../log_stores/default/default_log_store.py | 153 ++++++-- src/zenml/log_stores/otel/otel_log_store.py | 120 +++--- src/zenml/logger.py | 371 +++++++----------- .../logging/otel_logging_infrastructure.py | 313 +++++++++++++++ src/zenml/logging/step_logging.py | 72 ++-- 9 files changed, 1054 insertions(+), 419 deletions(-) create mode 100644 src/zenml/log_stores/default/artifact_store_exporter.py create mode 100644 src/zenml/logging/otel_logging_infrastructure.py diff --git a/docs/book/component-guide/log-stores/custom.md b/docs/book/component-guide/log-stores/custom.md index ba628722c81..5f4474d7a40 100644 --- a/docs/book/component-guide/log-stores/custom.md +++ b/docs/book/component-guide/log-stores/custom.md @@ -11,7 +11,9 @@ If you want to send logs to a backend that isn't covered by the built-in log sto The `BaseLogStore` provides three main methods that you need to implement: ```python +import logging from zenml.log_stores import BaseLogStore, BaseLogStoreConfig +from zenml.models import LogsRequest class MyLogStoreConfig(BaseLogStoreConfig): """Configuration for my custom log store.""" @@ -26,35 +28,60 @@ class MyLogStore(BaseLogStore): def config(self) -> MyLogStoreConfig: return cast(MyLogStoreConfig, self._config) - def activate( - self, - pipeline_run_id: UUID, - step_id: Optional[UUID] = None, - source: str = "step", - ) -> None: + def activate(self, log_request: LogsRequest) -> None: """Activate log collection. This is called at the start of a pipeline run or step. - Set up your logging handlers, connections, and any - background processing here. + Set up your logging handlers, connections, and register + with the routing handler. + + Args: + log_request: Contains log ID, URI, and metadata. + """ + from zenml.logging.routing_handler import ( + ensure_routing_handler_installed, + set_active_log_store, + ) + + # Ensure global routing handler is installed + ensure_routing_handler_installed() + + # Initialize your backend connection + self._setup_backend(log_request) + + # Register this log store for current thread + set_active_log_store(self) + + def emit(self, record: logging.LogRecord) -> None: + """Process a log record. + + This is called by the routing handler for each log message. + Send the log to your backend. 
You can safely use print() + or logger.info() here - reentrancy protection prevents loops. + + Args: + record: The log record to process. """ - pass + # Send log to your backend + self._send_to_backend(record) def deactivate(self) -> None: """Deactivate log collection and clean up. This is called at the end of a pipeline run or step. - Flush any pending logs, close connections, and clean - up resources here. + Flush any pending logs, close connections, and unregister. """ - pass + from zenml.logging.routing_handler import set_active_log_store + + # Unregister from routing handler + set_active_log_store(None) + + # Clean up your backend connection + self._cleanup_backend() def fetch( self, - pipeline_run_id: UUID, - step_id: Optional[UUID] = None, - source: Optional[str] = None, - logs_uri: Optional[str] = None, + logs_model: LogsResponse, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = 20000, @@ -63,69 +90,132 @@ class MyLogStore(BaseLogStore): This is called by the server to retrieve logs for display. Query your backend and return logs as LogEntry objects. + + Args: + logs_model: Contains pipeline_run_id, step_id, and metadata. + start_time: Filter logs after this time. + end_time: Filter logs before this time. + limit: Maximum number of logs to return. + + Returns: + List of log entries. """ return [] ``` ### Implementation Patterns -#### 1. Using Python Logging Handlers +#### 1. Direct Implementation (Simple) -The most common pattern is to create a `logging.Handler` that sends logs to your backend: +The simplest pattern is to directly implement the `emit()` method: ```python import logging from zenml.log_stores import BaseLogStore -from zenml.logger import logging_handlers, get_storage_log_level class MyLogStore(BaseLogStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.handler = None - self._original_root_level = None + self.backend_client = None - def activate(self, pipeline_run_id, step_id=None, source="step"): - self.handler = MyCustomHandler( - backend_url=self.config.backend_url, - pipeline_run_id=pipeline_run_id, - step_id=step_id, + def activate(self, log_request): + from zenml.logging.routing_handler import ( + ensure_routing_handler_installed, + set_active_log_store, ) - self.handler.setLevel(get_storage_log_level().value) + # Install routing handler + ensure_routing_handler_installed() - root_logger = logging.getLogger() - root_logger.addHandler(self.handler) + # Set up backend connection + self.backend_client = MyBackendClient( + url=self.config.backend_url, + log_id=log_request.id, + ) - self._original_root_level = root_logger.level - handler_levels = [h.level for h in root_logger.handlers] - root_logger.setLevel(min(handler_levels)) + # Register for current thread + set_active_log_store(self) + + def emit(self, record): + """Process each log record.""" + # You can safely use print() or logger.info() here! + # Reentrancy protection prevents infinite loops. 
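+        # Keep emit() fast: heavy or blocking I/O is better handled in a
+        # background thread (see the "Background Processing" pattern below).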
- logging_handlers.add(self.handler) + log_data = { + "message": record.getMessage(), + "level": record.levelname, + "timestamp": record.created, + } + + self.backend_client.send_log(log_data) def deactivate(self): - if not self.handler: - return - - root_logger = logging.getLogger() - if self.handler in root_logger.handlers: - root_logger.removeHandler(self.handler) - - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) - - logging_handlers.remove(self.handler) + from zenml.logging.routing_handler import set_active_log_store + + if self.backend_client: + self.backend_client.close() + + set_active_log_store(None) ``` -#### 2. Background Processing +#### 2. Using Internal Handlers (Advanced) -For efficient log handling, use background threads or async processing: +If you want to use Python's logging.Handler internally: + +```python +class MyLogStore(BaseLogStore): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._handler = None + + def activate(self, log_request): + from zenml.logging.routing_handler import ( + ensure_routing_handler_installed, + set_active_log_store, + ) + + ensure_routing_handler_installed() + + # Create internal handler (not added to root logger) + self._handler = MyCustomHandler( + backend_url=self.config.backend_url, + log_id=log_request.id, + ) + + set_active_log_store(self) + + def emit(self, record): + """Delegate to internal handler.""" + if self._handler: + self._handler.emit(record) + + def deactivate(self): + from zenml.logging.routing_handler import set_active_log_store + + if self._handler: + self._handler.flush() + self._handler.close() + + set_active_log_store(None) +``` + +#### 3. Background Processing + +For efficient log handling, use background threads for batching: ```python import queue import threading class MyLogStore(BaseLogStore): - def activate(self, pipeline_run_id, step_id=None, source="step"): + def activate(self, log_request): + from zenml.logging.routing_handler import ( + ensure_routing_handler_installed, + set_active_log_store, + ) + + ensure_routing_handler_installed() + self.log_queue = queue.Queue(maxsize=2048) self.shutdown_event = threading.Event() self.worker_thread = threading.Thread( @@ -133,44 +223,58 @@ class MyLogStore(BaseLogStore): daemon=True ) self.worker_thread.start() + + set_active_log_store(self) + + def emit(self, record): + """Queue logs for background processing.""" + try: + self.log_queue.put_nowait(record) + except queue.Full: + pass # Drop logs if queue is full def _process_logs(self): + """Background thread processes queued logs.""" while not self.shutdown_event.is_set(): try: - log_entry = self.log_queue.get(timeout=1) - self._send_to_backend(log_entry) + record = self.log_queue.get(timeout=1) + self._send_to_backend(record) except queue.Empty: continue def deactivate(self): + from zenml.logging.routing_handler import set_active_log_store + self.shutdown_event.set() if self.worker_thread: self.worker_thread.join(timeout=5) + + set_active_log_store(None) ``` -#### 3. Fetching Logs +#### 4. 
Fetching Logs Implement fetch using HTTP APIs or SDKs: ```python +import requests from zenml.logging.step_logging import LogEntry class MyLogStore(BaseLogStore): def fetch( self, - pipeline_run_id, - step_id=None, - source=None, - logs_uri=None, + logs_model, start_time=None, end_time=None, limit=20000, ): + """Fetch logs from your backend.""" query = { - "pipeline_run_id": str(pipeline_run_id), + "pipeline_run_id": str(logs_model.pipeline_run_id), } - if step_id: - query["step_id"] = str(step_id) + + if logs_model.step_run_id: + query["step_id"] = str(logs_model.step_run_id) if start_time: query["start_time"] = start_time.isoformat() if end_time: diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index e4e655a5ec5..9fadf3907e0 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -13,9 +13,10 @@ # permissions and limitations under the License. """Base class for log stores.""" +import logging from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, List, Optional, Type, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, cast from zenml.enums import StackComponentType from zenml.stack import Flavor, StackComponent, StackComponentConfig @@ -68,6 +69,25 @@ def deactivate(self) -> None: any background threads or connections. """ + def emit(self, record: logging.LogRecord) -> None: + """Process a log record from the routing handler. + + This method is called by the ZenML routing handler for each log + record that should be stored by this log store. Implementations + should process the record according to their backend's requirements. + + The default implementation does nothing. This allows log stores that + only need to collect logs during pipeline execution (via activate/ + deactivate) without real-time processing to skip implementing this. + + Args: + record: The Python logging record to process. + """ + # Default: do nothing + # This is NOT abstract, so implementations can opt-in + pass + + @abstractmethod def fetch( self, diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 8a5748cc884..3c68c27e776 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -21,7 +21,7 @@ from zenml.log_stores import BaseLogStore, BaseLogStoreConfig from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.stack.flavor import Flavor -from zenml.utils.secret_utils import PlainSerializedSecretStr, SecretField +from zenml.utils.secret_utils import PlainSerializedSecretStr class DatadogLogStoreConfig(OtelLogStoreConfig): diff --git a/src/zenml/log_stores/default/artifact_store_exporter.py b/src/zenml/log_stores/default/artifact_store_exporter.py new file mode 100644 index 00000000000..5a97bf84cd7 --- /dev/null +++ b/src/zenml/log_stores/default/artifact_store_exporter.py @@ -0,0 +1,202 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry exporter that writes logs to ZenML artifact store.""" + +import time +from datetime import datetime +from typing import TYPE_CHECKING, Optional, Sequence + +from opentelemetry.sdk._logs.export import LogExporter, LogExportResult + +if TYPE_CHECKING: + from opentelemetry.sdk._logs import LogData + + from zenml.artifact_stores import BaseArtifactStore + +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +class ArtifactStoreExporter(LogExporter): + """OpenTelemetry exporter that writes logs to ZenML artifact store. + + Replaces the custom LogsStorage implementation with a standard + OpenTelemetry exporter. Logs are batched by BatchLogRecordProcessor + and written to the artifact store. + """ + + def __init__( + self, + logs_uri: str, + artifact_store: "BaseArtifactStore", + ): + """Initialize the artifact store exporter. + + Args: + logs_uri: URI where logs should be written. + artifact_store: The artifact store to write to. + """ + self.logs_uri = logs_uri + self.artifact_store = artifact_store + self.log_buffer: list[str] = [] + self.file_counter = 0 + + def export(self, batch: Sequence["LogData"]) -> LogExportResult: + """Export a batch of logs to the artifact store. + + Args: + batch: Sequence of LogData to export. + + Returns: + LogExportResult indicating success or failure. + """ + if not batch: + return LogExportResult.SUCCESS + + try: + # Format logs + log_lines = [] + for log_data in batch: + log_record = log_data.log_record + + # Format as ZenML log entry + log_line = self._format_log_entry( + message=str(log_record.body) if log_record.body else "", + level=log_record.severity_text, + timestamp_ns=log_record.timestamp, + ) + log_lines.append(log_line) + + # Write to artifact store + if log_lines: + self._write_to_artifact_store(log_lines) + + return LogExportResult.SUCCESS + + except Exception as e: + logger.error(f"Failed to export logs to artifact store: {e}") + return LogExportResult.FAILURE + + def _format_log_entry( + self, + message: str, + level: Optional[str], + timestamp_ns: Optional[int], + ) -> str: + """Format a log entry in ZenML format. + + Args: + message: The log message. + level: The log level (DEBUG, INFO, etc.). + timestamp_ns: Timestamp in nanoseconds. + + Returns: + Formatted log line. + """ + # Convert timestamp + if timestamp_ns: + timestamp = datetime.fromtimestamp(timestamp_ns / 1e9) + timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S,%f")[:-3] + else: + timestamp_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[ + :-3 + ] + + # Map OTel severity to ZenML level + zenml_level = self._map_severity_to_level(level) + + # Format: timestamp|level|message + return f"{timestamp_str}|{zenml_level}|{message}" + + def _map_severity_to_level(self, severity: Optional[str]) -> str: + """Map OpenTelemetry severity to ZenML log level. + + Args: + severity: OTel severity text. + + Returns: + ZenML log level string. 
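+            Unknown or missing severities fall back to "INFO".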
+ """ + if not severity: + return "INFO" + + severity_upper = severity.upper() + if "DEBUG" in severity_upper or "TRACE" in severity_upper: + return "DEBUG" + elif "INFO" in severity_upper: + return "INFO" + elif "WARN" in severity_upper: + return "WARNING" + elif "ERROR" in severity_upper: + return "ERROR" + elif ( + "CRITICAL" in severity_upper + or "FATAL" in severity_upper + or "EMERGENCY" in severity_upper + ): + return "CRITICAL" + else: + return "INFO" + + def _write_to_artifact_store(self, log_lines: list[str]) -> None: + """Write log lines to the artifact store. + + Args: + log_lines: List of formatted log lines. + """ + # Create unique file name with timestamp + timestamp = int(time.time() * 1000) + self.file_counter += 1 + file_uri = f"{self.logs_uri}.{timestamp}.{self.file_counter}" + + # Join lines and write + content = "\n".join(log_lines) + "\n" + + try: + # Write to artifact store + with self.artifact_store.open(file_uri, "w") as f: + f.write(content) + + logger.debug(f"Wrote {len(log_lines)} log lines to {file_uri}") + except Exception as e: + logger.error(f"Failed to write logs to {file_uri}: {e}") + raise + + def shutdown(self) -> None: + """Shutdown the exporter and flush any remaining logs.""" + if self.log_buffer: + try: + self._write_to_artifact_store(self.log_buffer) + self.log_buffer.clear() + except Exception as e: + logger.warning(f"Error during shutdown flush: {e}") + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered logs. + + Args: + timeout_millis: Timeout in milliseconds. + + Returns: + True if successful. + """ + try: + if self.log_buffer: + self._write_to_artifact_store(self.log_buffer) + self.log_buffer.clear() + return True + except Exception as e: + logger.warning(f"Force flush failed: {e}") + return False diff --git a/src/zenml/log_stores/default/default_log_store.py b/src/zenml/log_stores/default/default_log_store.py index 43279430082..91c4656b76b 100644 --- a/src/zenml/log_stores/default/default_log_store.py +++ b/src/zenml/log_stores/default/default_log_store.py @@ -61,11 +61,15 @@ LogsRequest, LogsResponse, ) +from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.time_utils import utc_now from zenml.zen_stores.base_zen_store import BaseZenStore if TYPE_CHECKING: from zenml.artifact_stores import BaseArtifactStore + from zenml.log_stores.default.artifact_store_exporter import ( + ArtifactStoreExporter, + ) from zenml.logging.step_logging import ( ArtifactStoreHandler, LogEntry, @@ -80,6 +84,44 @@ LOGS_EXTENSION = ".log" +def prepare_logs_uri( + artifact_store: "BaseArtifactStore", + log_id: UUID, +) -> str: + """Generates and prepares a URI for the log file or folder for a step. + + Args: + artifact_store: The artifact store on which the artifact will be stored. + log_id: The ID of the logs entity + + Returns: + The URI of the log storage (file or folder). + """ + logs_base_uri = os.path.join(artifact_store.path, "logs") + + if not artifact_store.exists(logs_base_uri): + artifact_store.makedirs(logs_base_uri) + + if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + logs_uri = os.path.join(logs_base_uri, log_id) + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs directory {logs_uri} already exists! Removing old log directory..." + ) + artifact_store.rmtree(logs_uri) + + artifact_store.makedirs(logs_uri) + else: + logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") + if artifact_store.exists(logs_uri): + logger.warning( + f"Logs file {logs_uri} already exists! 
Removing old log file..." + ) + artifact_store.remove(logs_uri) + + return sanitize_remote_path(logs_uri) + + def remove_ansi_escape_codes(text: str) -> str: """Auxiliary function to remove ANSI escape codes from a given string. @@ -644,10 +686,9 @@ def _split_to_chunks(self, message: str) -> List[str]: class DefaultLogStore(BaseLogStore): """Log store that saves logs to the artifact store. - This implementation uses the artifact store as the backend for log storage, - maintaining backward compatibility with existing ZenML behavior. Logs are - written to the artifact store using a background thread and queue for - efficient batching. + This implementation uses OpenTelemetry infrastructure to write logs + to the artifact store. Uses shared BatchLogRecordProcessor with + thread pool for efficient parallel exports. """ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -657,10 +698,10 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: *args: Positional arguments for the base class. **kwargs: Keyword arguments for the base class. """ - self.storage: Optional["LogsStorage"] = None - self.handler: Optional["ArtifactStoreHandler"] = None - - self._original_root_level: Optional[int] = None + super().__init__(*args, **kwargs) + self._exporter: Optional["ArtifactStoreExporter"] = None + self._handler: Optional[logging.Handler] = None + self._log_id: Optional[str] = None @property def config(self) -> DefaultLogStoreConfig: @@ -677,50 +718,94 @@ def activate(self, log_request: "LogsRequest") -> None: Args: log_request: The log request model. """ - # Create storage and handler - self.storage = LogsStorage( + from opentelemetry.sdk._logs import LoggingHandler + from opentelemetry.sdk.resources import Resource + + from zenml.log_stores.default.artifact_store_exporter import ( + ArtifactStoreExporter, + ) + from zenml.logging.otel_logging_infrastructure import ( + get_shared_otel_infrastructure, + ) + from zenml.logging.routing_handler import set_active_log_store + + # Get shared OTel infrastructure + logger_provider, routing_exporter = get_shared_otel_infrastructure() + + # Create artifact store exporter for this log store + self._exporter = ArtifactStoreExporter( logs_uri=log_request.uri, artifact_store=Client().active_stack.artifact_store, ) - self.handler = ArtifactStoreHandler(self.storage) - # Add handler to root logger - root_logger = logging.getLogger() - root_logger.addHandler(self.handler) + # Register exporter with routing exporter + self._log_id = str(log_request.id) + routing_exporter.register_exporter(self._log_id, self._exporter) + + # Create resource with log_id and LoggerProvider + from opentelemetry.sdk._logs import LoggerProvider + + resource = Resource.create({"zenml.log_id": self._log_id}) + self._logger_provider_with_resource = LoggerProvider(resource=resource) + + # Share the same processor (routing exporter) from the global provider + for processor in ( + logger_provider._multi_log_record_processor._log_record_processors + ): + self._logger_provider_with_resource.add_log_record_processor( + processor + ) + + self._handler = LoggingHandler( + level=get_storage_log_level().value, + logger_provider=self._logger_provider_with_resource, + ) - # Set root logger level to minimum of all handlers - self._original_root_level = root_logger.level - handler_levels = [handler.level for handler in root_logger.handlers] - min_level = min(handler_levels) - if min_level < root_logger.level: - root_logger.setLevel(min_level) + # Register this log store for routing + 
set_active_log_store(self) # Add to context variables for print capture - logging_handlers.add(self.handler) + logging_handlers.add(self._handler) + + def emit(self, record: logging.LogRecord) -> None: + """Process a log record by sending to artifact store. + + Args: + record: The log record to process. + """ + if self._handler: + try: + self._handler.emit(record) + except Exception: + # Don't let logging errors break execution + pass def deactivate(self) -> None: """Deactivate log collection and flush remaining logs.""" - if not self.handler: + if not self._handler: return - # Remove handler from root logger - root_logger = logging.getLogger() - if self.handler in root_logger.handlers: - root_logger.removeHandler(self.handler) + # Unregister from the current thread's context + from zenml.logging.otel_logging_infrastructure import ( + get_shared_otel_infrastructure, + ) + from zenml.logging.routing_handler import set_active_log_store - # Restore original root logger level - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) + set_active_log_store(None) # Remove from context variables - logging_handlers.remove(self.handler) + logging_handlers.remove(self._handler) + + # Unregister exporter from routing + if self._log_id and self._exporter: + _, routing_exporter = get_shared_otel_infrastructure() + routing_exporter.unregister_exporter(self._log_id) - # Shutdown storage thread (flushes and merges logs) - if self.storage: + # Flush exporter try: - self.storage._shutdown_log_storage_thread() + self._exporter.force_flush() except Exception as e: - logger.warning(f"Error shutting down log storage: {e}") + logger.warning(f"Error flushing exporter: {e}") logger.debug("DefaultLogStore deactivated") diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 17a8be3f456..82bbbbcfe18 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -17,10 +17,8 @@ from abc import abstractmethod from datetime import datetime from typing import TYPE_CHECKING, Any, List, Optional, cast -from uuid import UUID from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler -from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource from zenml.log_stores.base_log_store import BaseLogStore @@ -57,12 +55,10 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: **kwargs: Keyword arguments for the base class. """ super().__init__(*args, **kwargs) - self.logger_provider: Optional["LoggerProvider"] = None - self.handler: Optional[logging.Handler] = None - self._original_root_level: Optional[int] = None - self._pipeline_run_id: Optional[UUID] = None - self._step_id: Optional[UUID] = None - self._source: Optional[str] = None + self._logger_provider_with_resource: Optional["LoggerProvider"] = None + self._handler: Optional[logging.Handler] = None + self._exporter: Optional["LogExporter"] = None + self._log_id: Optional[str] = None @property def config(self) -> OtelLogStoreConfig: @@ -90,76 +86,94 @@ def activate(self, log_request: "LogsRequest") -> None: Args: log_request: The log request model. 
""" - # Create resource + from zenml.logging.otel_logging_infrastructure import ( + get_shared_otel_infrastructure, + ) + from zenml.logging.routing_handler import set_active_log_store + + # Get shared OTel infrastructure + logger_provider, routing_exporter = get_shared_otel_infrastructure() + + # Get exporter for this log store + self._exporter = self.get_exporter() + + # Register exporter with routing exporter + self._log_id = str(log_request.id) + routing_exporter.register_exporter(self._log_id, self._exporter) + + # Create resource with log_id and service info otel_resource = Resource.create( { "service.name": self.config.service_name, "service.version": "0.91.0", # TODO: Fetch this - "zenml.log_id": str(log_request.id), + "zenml.log_id": self._log_id, } ) - # Create logger provider - self.logger_provider = LoggerProvider(resource=otel_resource) - - # Get exporter - exporter = self.get_exporter() - - # Create batch processor for efficient background processing - processor = BatchLogRecordProcessor( - exporter, - max_queue_size=self.config.max_queue_size, - schedule_delay_millis=self.config.schedule_delay_millis, - max_export_batch_size=self.config.max_export_batch_size, + # Create logger provider with this resource + self._logger_provider_with_resource = LoggerProvider( + resource=otel_resource ) - self.logger_provider.add_log_record_processor(processor) - # Create handler for Python logging integration - self.handler = LoggingHandler( + # Share the same processor (routing exporter) from the global provider + for processor in ( + logger_provider._multi_log_record_processor._log_record_processors + ): + self._logger_provider_with_resource.add_log_record_processor( + processor + ) + + # Create handler + self._handler = LoggingHandler( level=get_storage_log_level().value, - logger_provider=self.logger_provider, + logger_provider=self._logger_provider_with_resource, ) - # Add handler to root logger - root_logger = logging.getLogger() - root_logger.addHandler(self.handler) - - # Set root logger level to minimum of all handlers - self._original_root_level = root_logger.level - handler_levels = [handler.level for handler in root_logger.handlers] - min_level = min(handler_levels) - if min_level < root_logger.level: - root_logger.setLevel(min_level) + # Register this log store for routing + set_active_log_store(self) # Add to context variables for print capture - logging_handlers.add(self.handler) + logging_handlers.add(self._handler) + + def emit(self, record: logging.LogRecord) -> None: + """Process a log record by sending to OpenTelemetry. + + Args: + record: The log record to process. 
+ """ + if self._handler: + try: + self._handler.emit(record) + except Exception: + # Don't let logging errors break execution + pass def deactivate(self) -> None: """Deactivate log collection and flush remaining logs.""" - if not self.handler: + if not self._handler: return - # Remove handler from root logger - root_logger = logging.getLogger() - if self.handler in root_logger.handlers: - root_logger.removeHandler(self.handler) + # Unregister from the current thread's context + from zenml.logging.otel_logging_infrastructure import ( + get_shared_otel_infrastructure, + ) + from zenml.logging.routing_handler import set_active_log_store - # Restore original root logger level - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) + set_active_log_store(None) # Remove from context variables - logging_handlers.remove(self.handler) + logging_handlers.remove(self._handler) + + # Unregister exporter from routing + if self._log_id and self._exporter: + _, routing_exporter = get_shared_otel_infrastructure() + routing_exporter.unregister_exporter(self._log_id) - # Flush and shutdown logger provider - if self.logger_provider: + # Flush exporter try: - self.logger_provider.force_flush() - self.logger_provider.shutdown() + self._exporter.force_flush() except Exception as e: - logger.warning( - f"Error shutting down OTel logger provider: {e}" - ) + logger.warning(f"Error flushing exporter: {e}") logger.debug("OtelLogStore deactivated") diff --git a/src/zenml/logger.py b/src/zenml/logger.py index 24bdd1ef7c8..63f78e8fa04 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -13,43 +13,58 @@ # permissions and limitations under the License. """Logger implementation.""" -import builtins +import json import logging import os -import re import sys from contextvars import ContextVar -from typing import TYPE_CHECKING, Any, Dict - -if TYPE_CHECKING: - from zenml.logging.step_logging import ArtifactStoreHandler +from typing import Any, Dict, Optional from rich.traceback import install as rich_tb_install from zenml.constants import ( ENABLE_RICH_TRACEBACK, - ENV_ZENML_CAPTURE_PRINTS, ENV_ZENML_LOGGING_COLORS_DISABLED, - ENV_ZENML_LOGGING_FORMAT, ENV_ZENML_SUPPRESS_LOGS, ZENML_LOGGING_VERBOSITY, - ZENML_STORAGE_LOGGING_VERBOSITY, handle_bool_env_var, ) from zenml.enums import LoggingLevels -from zenml.utils.context_utils import ContextVarList ZENML_LOGGING_COLORS_DISABLED = handle_bool_env_var( ENV_ZENML_LOGGING_COLORS_DISABLED, False ) +# Logic for formatting console messages step_names_in_console: ContextVar[bool] = ContextVar( "step_names_in_console", default=False ) -logging_handlers: ContextVarList["ArtifactStoreHandler"] = ContextVarList( - "logging_handlers" -) + +grey: str = "\x1b[90m" +white: str = "\x1b[37m" +pink: str = "\x1b[35m" +green: str = "\x1b[32m" +yellow: str = "\x1b[33m" +red: str = "\x1b[31m" +cyan: str = "\x1b[1;36m" +bold_red: str = "\x1b[31;1m" +purple: str = "\x1b[38;5;105m" +blue: str = "\x1b[34m" +reset: str = "\x1b[0m" + +COLORS: Dict[LoggingLevels, str] = { + LoggingLevels.DEBUG: grey, + LoggingLevels.INFO: white, + LoggingLevels.WARN: yellow, + LoggingLevels.ERROR: red, + LoggingLevels.CRITICAL: bold_red, +} + +_original_stdout_write: Optional[Any] = None +_original_stderr_write: Optional[Any] = None +_stdout_wrapped: bool = False +_stderr_wrapped: bool = False def _add_step_name_to_message(message: str) -> str: @@ -82,111 +97,29 @@ def _add_step_name_to_message(message: str) -> str: return message -class 
CustomFormatter(logging.Formatter): - """Formats logs according to custom specifications.""" - - grey: str = "\x1b[90m" - white: str = "\x1b[37m" - pink: str = "\x1b[35m" - green: str = "\x1b[32m" - yellow: str = "\x1b[33m" - red: str = "\x1b[31m" - cyan: str = "\x1b[1;36m" - bold_red: str = "\x1b[31;1m" - purple: str = "\x1b[38;5;105m" - blue: str = "\x1b[34m" - reset: str = "\x1b[0m" - - def _get_format_template(self, record: logging.LogRecord) -> str: - """Get the format template based on the logging level. +def format_console_message(message: str) -> str: + """Format a message for console output. - Args: - record: The log record to format. - - Returns: - The format template string. - """ - # Only include location info for DEBUG level - if get_logging_level() == LoggingLevels.DEBUG: - return "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" - else: - return "%(message)s" - - COLORS: Dict[LoggingLevels, str] = { - LoggingLevels.DEBUG: grey, - LoggingLevels.INFO: white, - LoggingLevels.WARN: yellow, - LoggingLevels.ERROR: red, - LoggingLevels.CRITICAL: bold_red, - } + Args: + message: The message to format. - def format(self, record: logging.LogRecord) -> str: - """Converts a log record to a (colored) string. + Returns: + The formatted message. + """ + return message - Args: - record: LogRecord generated by the code. - Returns: - A string formatted according to specifications. - """ - # Get the template - format_template = self._get_format_template(record) - - # Apply step name prepending if enabled (for console display) - message = record.getMessage() - try: - if step_names_in_console.get(): - message = _add_step_name_to_message(message) - except Exception: - # If we can't get step context, just use the original message - pass - - # Create a new record with the modified message - modified_record = logging.LogRecord( - name=record.name, - level=record.levelno, - pathname=record.pathname, - lineno=record.lineno, - msg=message, - args=(), - exc_info=record.exc_info, - ) +# Logger utilities +def get_logger(logger_name: str) -> logging.Logger: + """Main function to get logger name,. - if ZENML_LOGGING_COLORS_DISABLED: - # If color formatting is disabled, use the default format without colors - formatter = logging.Formatter(format_template) - return formatter.format(modified_record) - else: - # Use color formatting - log_fmt = ( - self.COLORS[LoggingLevels(record.levelno)] - + format_template - + self.reset - ) - formatter = logging.Formatter(log_fmt) - formatted_message = formatter.format(modified_record) - quoted_groups = re.findall("`([^`]*)`", formatted_message) - for quoted in quoted_groups: - formatted_message = formatted_message.replace( - "`" + quoted + "`", - self.reset - + self.purple - + quoted - + self.COLORS.get(LoggingLevels(record.levelno)), - ) + Args: + logger_name: Name of logger to initialize. - # Format URLs - url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" - urls = re.findall(url_pattern, formatted_message) - for url in urls: - formatted_message = formatted_message.replace( - url, - self.reset - + self.blue - + url - + self.COLORS.get(LoggingLevels(record.levelno)), - ) - return formatted_message + Returns: + A logger object. 
+ """ + return logging.getLogger(logger_name) def get_logging_level() -> LoggingLevels: @@ -206,23 +139,6 @@ def get_logging_level() -> LoggingLevels: return LoggingLevels[verbosity] -def get_storage_log_level() -> LoggingLevels: - """Get storage logging level from the env variable with safe fallback. - - Returns: - The storage logging level, defaulting to INFO if invalid. - - Raises: - KeyError: If the storage logging level is not found. - """ - verbosity = ZENML_STORAGE_LOGGING_VERBOSITY.upper() - if verbosity not in LoggingLevels.__members__: - raise KeyError( - f"Verbosity must be one of {list(LoggingLevels.__members__.keys())}" - ) - return LoggingLevels[verbosity] - - def set_root_verbosity() -> None: """Set the root verbosity.""" level = get_logging_level() @@ -240,134 +156,129 @@ def set_root_verbosity() -> None: get_logger(__name__).debug("Logging NOTSET") -def wrapped_print(*args: Any, **kwargs: Any) -> None: - """Wrapped print function. - - Args: - *args: Arguments to print - **kwargs: Keyword arguments for print - """ - original_print = getattr(builtins, "_zenml_original_print") - - file_arg = kwargs.get("file", sys.stdout) +class ZenMLFormatter(logging.Formatter): + """Formats logs according to custom specifications.""" - # IMPORTANT: Don't intercept internal calls to any objects - # other than sys.stdout and sys.stderr. This is especially - # critical for handling tracebacks. The default logging - # formatter uses StringIO to format tracebacks, we don't - # want to intercept it and create a LogRecord about it. - if file_arg not in (sys.stdout, sys.stderr): - original_print(*args, **kwargs) + def format(self, record: logging.LogRecord) -> str: + """Converts a log record to a (colored) string or structured JSON. - # Convert print arguments to message - message = " ".join(str(arg) for arg in args) + Args: + record: LogRecord generated by the code. - # Call active handlers first (for storage) - if message.strip(): - handlers = logging_handlers.get() + Returns: + A string formatted according to specifications. 
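+            The string is a JSON object carrying a "zenml": true marker,
+            which the wrapped stdout/stderr writers use to recognize and
+            unpack ZenML log records.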
+ """ + data = { + "zenml": True, + "timestamp": self.formatTime(record, datefmt="%Y-%m-%dT%H:%M:%S"), + "level": record.levelname, + "name": record.name, + "message": record.getMessage(), + } + + if record.exc_info: + data["exc_info"] = self.formatException(record.exc_info) + + return json.dumps(data, ensure_ascii=False) + + +def _wrapped_write(original_write: Any, stream_name: str) -> Any: + """Wrap stdout/stderr write method to parse and route logs.""" + from zenml.logging.step_logging import get_active_log_store, LogEntry + from zenml.utils import utc_now + + def wrapped_write(text: str) -> int: + """Wrap the write method to parse and route logs.""" + message = text + name = None + level = ( + LoggingLevels.INFO + if stream_name == "info" + else LoggingLevels.ERROR + ) + timestamp = utc_now() - for handler in handlers: + # Try to extract the message from a potential JSONified log entry + if text.startswith("{") and text.endswith("}"): try: - # Create a LogRecord for the handler - record = logging.LogRecord( - name="print", - level=logging.ERROR - if file_arg == sys.stderr - else logging.INFO, - pathname="", - lineno=0, - msg=message, - args=(), - exc_info=None, - ) - # Check if handler's level would accept this record - if record.levelno >= handler.level: - handler.emit(record) - except Exception: - # Don't let handler errors break print - pass + data = json.loads(text) - if step_names_in_console.get(): - message = _add_step_name_to_message(message) - - # Then call original print for console display - original_print(message, *args[1:], **kwargs) + if "zenml" in data and "message" in data: + message = data["message"] + name = data.get("name", None) + level = data.get("level", level) + timestamp = data.get("timestamp", timestamp) + else: + message = data + except Exception: + message = text + + # If there is an active log store + if log_store := get_active_log_store(): + log_store.emit( + LogEntry( + message=message, + name=name, + level=level, + timestamp=timestamp, + ) + ) -def setup_global_print_wrapping() -> None: - """Set up global print() wrapping with context-aware handlers.""" - capture_prints = handle_bool_env_var( - ENV_ZENML_CAPTURE_PRINTS, default=True - ) + # Format the message for console output + message = format_console_message(message) - if not capture_prints or hasattr(__builtins__, "_zenml_original_print"): - return + return original_write(message) - # Store original and replace print - setattr(builtins, "_zenml_original_print", builtins.print) - setattr(builtins, "print", wrapped_print) + return wrapped_write -def get_formatter() -> logging.Formatter: - """Get a configured logging formatter. +def wrap_stdout_stderr() -> None: + """Wrap stdout and stderr write methods.""" + global _stdout_wrapped, _stderr_wrapped + global _original_stdout_write, _original_stderr_write - Returns: - The formatter. - """ - if log_format := os.environ.get(ENV_ZENML_LOGGING_FORMAT, None): - return logging.Formatter(fmt=log_format) - else: - return CustomFormatter() + if not _stdout_wrapped: + _original_stdout_write = getattr(sys.stdout, "write") + setattr( + sys.stdout, + "write", + _wrapped_write(_original_stdout_write, "info"), + ) + _stdout_wrapped = True + + if not _stderr_wrapped: + _original_stderr_write = getattr(sys.stderr, "write") + setattr( + sys.stderr, + "write", + _wrapped_write(_original_stderr_write, "error"), + ) + _stderr_wrapped = True -def get_console_handler() -> Any: +def get_zenml_handler() -> Any: """Get console handler for logging. Returns: A console handler. 
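+        The handler writes structured JSON records (formatted by
+        ZenMLFormatter) to stdout.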
""" - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(get_formatter()) - # Set console handler level explicitly to console verbosity - console_handler.setLevel(get_logging_level().value) - return console_handler - - -def get_logger(logger_name: str) -> logging.Logger: - """Main function to get logger name,. - - Args: - logger_name: Name of logger to initialize. - - Returns: - A logger object. - """ - return logging.getLogger(logger_name) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(ZenMLFormatter()) + return handler def init_logging() -> None: - """Initialize logging with default levels.""" - # Mute tensorflow cuda warnings - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + """Initialize the logging system.""" set_root_verbosity() + wrap_stdout_stderr() - # Check if console handler already exists to avoid duplicates + # Add the ZenML handler to the root logger root_logger = logging.getLogger() - has_console_handler = any( - isinstance(handler, logging.StreamHandler) - and handler.stream == sys.stdout - for handler in root_logger.handlers - ) - - if not has_console_handler: - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(get_formatter()) - # Set console handler level explicitly to console verbosity - console_handler.setLevel(get_logging_level().value) - root_logger.addHandler(console_handler) + root_logger.addHandler(get_zenml_handler()) - # Initialize global print wrapping - setup_global_print_wrapping() + # Mute tensorflow cuda warnings + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Enable logs if environment variable SUPPRESS_ZENML_LOGS is not set to True suppress_zenml_logs: bool = handle_bool_env_var( diff --git a/src/zenml/logging/otel_logging_infrastructure.py b/src/zenml/logging/otel_logging_infrastructure.py new file mode 100644 index 00000000000..26d90442396 --- /dev/null +++ b/src/zenml/logging/otel_logging_infrastructure.py @@ -0,0 +1,313 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""Shared OpenTelemetry logging infrastructure for all log stores. + +Provides a unified backend using a single BatchLogRecordProcessor. +All log stores share this infrastructure, routing logs by log_id to +specific exporters. 
+""" + +import concurrent.futures +import threading +import time +from typing import TYPE_CHECKING, Dict, Optional, Sequence + +from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs.export import ( + BatchLogRecordProcessor, + LogExporter, + LogExportResult, +) + +if TYPE_CHECKING: + from opentelemetry.sdk._logs import LogData + +from zenml.logger import get_logger + +logger = get_logger(__name__) + +# Global shared infrastructure (singleton per process) +_shared_logger_provider: Optional[LoggerProvider] = None +_routing_exporter: Optional["RoutingLogExporter"] = None +_infrastructure_lock = threading.Lock() + + +class RoutingLogExporter(LogExporter): + """Routes logs to different exporters based on log_id. + + Processes exports in parallel using a thread pool for better performance + when multiple log stores are active. + """ + + def __init__(self, max_concurrent_exporters: int = 10): + """Initialize the routing exporter with thread pool. + + Args: + max_concurrent_exporters: Maximum number of exporters to run in parallel. + """ + self._exporters: Dict[str, LogExporter] = {} + self._lock = threading.Lock() + self._executor = concurrent.futures.ThreadPoolExecutor( + max_workers=max_concurrent_exporters, + thread_name_prefix="zenml-log-export", + ) + self._export_count = 0 + self._slow_export_count = 0 + + def register_exporter(self, log_id: str, exporter: LogExporter) -> None: + """Register an exporter for a specific log_id. + + Args: + log_id: Unique identifier for the log store. + exporter: The exporter to handle logs for this log_id. + """ + with self._lock: + self._exporters[log_id] = exporter + logger.debug(f"Registered exporter for log_id: {log_id}") + + def unregister_exporter(self, log_id: str) -> None: + """Unregister an exporter for a specific log_id. + + Args: + log_id: The log_id to unregister. + """ + with self._lock: + exporter = self._exporters.pop(log_id, None) + if exporter: + logger.debug(f"Unregistered exporter for log_id: {log_id}") + + def export(self, batch: Sequence["LogData"]) -> LogExportResult: + """Route logs to appropriate exporters based on log_id. + + Logs are grouped by log_id from the Resource attributes, then + exported in parallel using the thread pool. + + Args: + batch: Sequence of LogData to export. + + Returns: + LogExportResult indicating success or failure. 
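+
+        Note: routing relies on each record carrying the log_id as a
+        resource attribute; illustratively, the emitting side creates
+        something like ``Resource.create({"zenml.log_id": "<log-id>"})``.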
+ """ + if not batch: + return LogExportResult.SUCCESS + + self._export_count += 1 + start_time = time.time() + + # Group logs by log_id + logs_by_id: Dict[str, list] = {} + + for log_data in batch: + # Extract log_id from Resource attributes + log_id = None + if log_data.log_record.resource: + attrs = dict(log_data.log_record.resource.attributes) + log_id = attrs.get("zenml.log_id") + + if log_id: + logs_by_id.setdefault(log_id, []).append(log_data) + else: + logger.debug("Received log without zenml.log_id") + + # Submit all exports to thread pool in parallel + futures = [] + with self._lock: + for log_id, logs in logs_by_id.items(): + exporter = self._exporters.get(log_id) + if exporter: + # Submit to thread pool (non-blocking) + future = self._executor.submit( + self._safe_export, exporter, logs, log_id + ) + futures.append(future) + + # Wait for all exports to complete + all_success = True + timeout = 30 # seconds total for all exports + + try: + for future in concurrent.futures.as_completed( + futures, timeout=timeout + ): + try: + result = future.result(timeout=1) + if result != LogExportResult.SUCCESS: + all_success = False + except concurrent.futures.TimeoutError: + logger.error("Export timeout waiting for result") + all_success = False + except Exception as e: + logger.error(f"Export failed: {e}") + all_success = False + except concurrent.futures.TimeoutError: + logger.error(f"Exports took longer than {timeout}s timeout") + all_success = False + + # Monitor performance + duration = time.time() - start_time + if duration > 1.5: # Slower than batch interval + self._slow_export_count += 1 + if self._slow_export_count % 10 == 0: + logger.warning( + f"Slow exports detected: {duration:.2f}s " + f"(total slow: {self._slow_export_count}/{self._export_count})" + ) + + return ( + LogExportResult.SUCCESS if all_success else LogExportResult.FAILURE + ) + + def _safe_export( + self, exporter: LogExporter, logs: Sequence["LogData"], log_id: str + ) -> LogExportResult: + """Safely export logs with error handling. + + Args: + exporter: The exporter to use. + logs: Logs to export. + log_id: ID for logging purposes. + + Returns: + Export result. + """ + try: + return exporter.export(logs) + except Exception as e: + logger.error(f"Export failed for log_id {log_id}: {e}") + return LogExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the routing exporter and thread pool.""" + logger.debug("Shutting down routing exporter thread pool") + try: + self._executor.shutdown(wait=True, timeout=30) + except Exception as e: + logger.warning(f"Error shutting down thread pool: {e}") + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered logs. + + Args: + timeout_millis: Timeout in milliseconds. + + Returns: + True if successful. 
+ """ + # Flush all registered exporters in parallel + futures = [] + with self._lock: + for exporter in self._exporters.values(): + future = self._executor.submit( + self._safe_flush, exporter, timeout_millis + ) + futures.append(future) + + # Wait for all flushes + all_success = True + timeout_sec = timeout_millis / 1000.0 + try: + for future in concurrent.futures.as_completed( + futures, timeout=timeout_sec + ): + try: + if not future.result(timeout=1): + all_success = False + except Exception as e: + logger.warning(f"Force flush failed: {e}") + all_success = False + except concurrent.futures.TimeoutError: + logger.warning("Force flush timeout") + all_success = False + + return all_success + + def _safe_flush(self, exporter: LogExporter, timeout_millis: int) -> bool: + """Safely flush an exporter with error handling. + + Args: + exporter: The exporter to flush. + timeout_millis: Timeout in milliseconds. + + Returns: + True if successful. + """ + try: + return exporter.force_flush(timeout_millis) + except Exception as e: + logger.warning(f"Flush failed: {e}") + return False + + +def get_shared_otel_infrastructure() -> tuple[ + LoggerProvider, RoutingLogExporter +]: + """Get or create shared OpenTelemetry logging infrastructure. + + Creates a single LoggerProvider with BatchLogRecordProcessor and + RoutingLogExporter that all log stores share. + + Returns: + Tuple of (LoggerProvider, RoutingLogExporter). + """ + global _shared_logger_provider, _routing_exporter + + if _shared_logger_provider is None: + with _infrastructure_lock: + if _shared_logger_provider is None: + logger.info( + "Initializing shared OTel logging infrastructure " + "with 1 background thread" + ) + + # Create routing exporter + _routing_exporter = RoutingLogExporter() + + # Create shared logger provider + _shared_logger_provider = LoggerProvider() + + # One background thread for all log stores + processor = BatchLogRecordProcessor( + _routing_exporter, + max_queue_size=4096, # Larger for shared use + schedule_delay_millis=1000, # Batch every 1 second + max_export_batch_size=512, # Export in batches of 512 + ) + _shared_logger_provider.add_log_record_processor(processor) + + return _shared_logger_provider, _routing_exporter + + +def shutdown_shared_infrastructure() -> None: + """Shutdown the shared OpenTelemetry infrastructure. + + This should be called on process shutdown to cleanly close all resources. + """ + global _shared_logger_provider, _routing_exporter + + if _shared_logger_provider: + logger.info("Shutting down shared OTel logging infrastructure") + try: + _shared_logger_provider.force_flush() + _shared_logger_provider.shutdown() + except Exception as e: + logger.warning(f"Error during shutdown: {e}") + + if _routing_exporter: + try: + _routing_exporter.shutdown() + except Exception as e: + logger.warning(f"Error shutting down routing exporter: {e}") + + _shared_logger_provider = None + _routing_exporter = None diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index c43a4cb2983..3797e89eb0c 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -13,7 +13,6 @@ # permissions and limitations under the License. 
"""ZenML logging handler.""" -import os import re from contextlib import nullcontext from contextvars import ContextVar @@ -42,11 +41,34 @@ LogsRequest, PipelineSnapshotResponse, ) -from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.time_utils import utc_now if TYPE_CHECKING: - from zenml.artifact_stores import BaseArtifactStore + from zenml.log_stores.base_log_store import BaseLogStore + +# Active log store per thread +_active_log_store: ContextVar[Optional["BaseLogStore"]] = ContextVar( + "active_log_store", default=None +) + + +def set_active_log_store(log_store: Optional["BaseLogStore"]) -> None: + """Set active log store for current thread. + + Args: + log_store: Log store to activate, or None to deactivate. + """ + _active_log_store.set(log_store) + + +def get_active_log_store() -> Optional["BaseLogStore"]: + """Get the active log store for the current thread. + + Returns: + The active log store, or None if no log store is active. + """ + return _active_log_store.get() + logger = get_logger(__name__) @@ -55,7 +77,6 @@ ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") -LOGS_EXTENSION = ".log" PIPELINE_RUN_LOGS_FOLDER = "pipeline_runs" # Maximum number of log entries to return in a single request @@ -64,44 +85,6 @@ DEFAULT_MESSAGE_SIZE = 5 * 1024 -def prepare_logs_uri( - artifact_store: "BaseArtifactStore", - log_id: UUID, -) -> str: - """Generates and prepares a URI for the log file or folder for a step. - - Args: - artifact_store: The artifact store on which the artifact will be stored. - log_id: The ID of the logs entity - - Returns: - The URI of the log storage (file or folder). - """ - logs_base_uri = os.path.join(artifact_store.path, "logs") - - if not artifact_store.exists(logs_base_uri): - artifact_store.makedirs(logs_base_uri) - - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - logs_uri = os.path.join(logs_base_uri, log_id) - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs directory {logs_uri} already exists! Removing old log directory..." - ) - artifact_store.rmtree(logs_uri) - - artifact_store.makedirs(logs_uri) - else: - logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs file {logs_uri} already exists! Removing old log file..." - ) - artifact_store.remove(logs_uri) - - return sanitize_remote_path(logs_uri) - - class LogEntry(BaseModel): """A structured log entry with parsed information.""" @@ -186,7 +169,10 @@ def generate_log_request(self) -> "LogsRequest": Returns: The log request model. 
""" - from zenml.log_stores.default.default_log_store import DefaultLogStore + from zenml.log_stores.default.default_log_store import ( + DefaultLogStore, + prepare_logs_uri, + ) if isinstance(self.log_store, DefaultLogStore): log_id = uuid4() From c73340123a7afd1390364a5d39be7364d861de81 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 9 Nov 2025 21:35:02 +0100 Subject: [PATCH 14/81] new big checkpoint --- .../kubernetes_orchestrator_entrypoint.py | 2 +- src/zenml/log_stores/base_log_store.py | 29 +- .../log_stores/datadog/datadog_log_store.py | 12 +- .../default/artifact_store_exporter.py | 250 +++++--- .../log_stores/default/default_log_store.py | 592 +----------------- .../default/default_log_store_flavor.py | 12 +- src/zenml/log_stores/otel/otel_log_store.py | 173 ++--- .../otel/otel_provider.py} | 90 ++- src/zenml/log_stores/utils.py | 2 +- src/zenml/logger.py | 181 +++--- .../logging/{step_logging.py => logging.py} | 111 +++- .../orchestrators/local/local_orchestrator.py | 181 +++--- src/zenml/orchestrators/step_launcher.py | 2 +- src/zenml/orchestrators/step_runner.py | 2 +- src/zenml/pipelines/pipeline_definition.py | 2 +- .../zen_server/routers/runs_endpoints.py | 2 +- .../zen_server/routers/steps_endpoints.py | 2 +- src/zenml/zen_stores/sql_zen_store.py | 4 +- 18 files changed, 640 insertions(+), 1009 deletions(-) rename src/zenml/{logging/otel_logging_infrastructure.py => log_stores/otel/otel_provider.py} (76%) rename src/zenml/logging/{step_logging.py => logging.py} (71%) diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index bfba199e407..aa90ff64264 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -56,7 +56,7 @@ pod_template_manifest_from_pod, ) from zenml.logger import get_logger -from zenml.logging.step_logging import setup_orchestrator_logging +from zenml.logging.logging import setup_orchestrator_logging from zenml.models import ( PipelineRunResponse, PipelineRunUpdate, diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 9fadf3907e0..74bf7df001a 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -16,14 +16,18 @@ import logging from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, cast +from typing import TYPE_CHECKING, List, Optional, Type, cast from zenml.enums import StackComponentType +from zenml.logging.logging import ( + DEFAULT_MESSAGE_SIZE, + MAX_ENTRIES_PER_REQUEST, + LogEntry, +) from zenml.stack import Flavor, StackComponent, StackComponentConfig if TYPE_CHECKING: - from zenml.logging.step_logging import LogEntry - from zenml.models import LogsRequest, LogsResponse + from zenml.models import LogsResponse class BaseLogStoreConfig(StackComponentConfig): @@ -47,17 +51,13 @@ def config(self) -> BaseLogStoreConfig: """ return cast(BaseLogStoreConfig, self._config) - # TODO: This should probably accept not just requests but also responses @abstractmethod - def activate(self, log_request: "LogsRequest") -> None: + def activate(self) -> None: """Activate the log store for log collection. This method is called when ZenML needs to start collecting and storing logs during pipeline or step execution. 
It should set up any necessary handlers, threads, or connections. - - Args: - log_request: The log request model. """ @abstractmethod @@ -69,10 +69,11 @@ def deactivate(self) -> None: any background threads or connections. """ + @abstractmethod def emit(self, record: logging.LogRecord) -> None: """Process a log record from the routing handler. - This method is called by the ZenML routing handler for each log + This method is called by the ZenML logging system for each log record that should be stored by this log store. Implementations should process the record according to their backend's requirements. @@ -81,12 +82,8 @@ def emit(self, record: logging.LogRecord) -> None: deactivate) without real-time processing to skip implementing this. Args: - record: The Python logging record to process. + record: The Python logging.LogRecord to process. """ - # Default: do nothing - # This is NOT abstract, so implementations can opt-in - pass - @abstractmethod def fetch( @@ -94,7 +91,8 @@ def fetch( logs_model: "LogsResponse", start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, - limit: int = 20000, + limit: int = MAX_ENTRIES_PER_REQUEST, + message_size: int = DEFAULT_MESSAGE_SIZE, ) -> List["LogEntry"]: """Fetch logs from the log store. @@ -113,6 +111,7 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. + message_size: Maximum size of a single log message in bytes. Returns: List of log entries matching the query. diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 4571d8892ba..916c659430a 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -24,7 +24,7 @@ from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger -from zenml.logging.step_logging import LogEntry +from zenml.logging.logging import LogEntry from zenml.models import LogsResponse logger = get_logger(__name__) @@ -67,35 +67,27 @@ def export(self, batch: List[LogData]) -> Any: for log_data in batch: log_record = log_data.log_record - # Extract resource attributes resource_attrs = {} if log_record.resource: resource_attrs = dict(log_record.resource.attributes) - # Extract log attributes log_attrs = {} if log_record.attributes: log_attrs = dict(log_record.attributes) - # Combine attributes with additional tags all_attrs = {**resource_attrs, **log_attrs} - # Build Datadog log entry log_entry = { "message": str(log_record.body), } - # Add severity if available if log_record.severity_text: log_entry["status"] = log_record.severity_text.lower() - # Add timestamp if available (convert from nanoseconds to milliseconds) if log_record.timestamp: log_entry["timestamp"] = int(log_record.timestamp / 1_000_000) - # Add all attributes as tags if all_attrs: - # Convert dict to Datadog tags format: key:value tags = [f"{k}:{v}" for k, v in all_attrs.items()] log_entry["ddtags"] = ",".join(tags) @@ -170,6 +162,7 @@ def fetch( start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = 20000, + message_size: int = 5120, ) -> List["LogEntry"]: """Fetch logs from Datadog's API. @@ -182,6 +175,7 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. 
+ message_size: Maximum size of a single log message in bytes. Returns: List of log entries from Datadog. diff --git a/src/zenml/log_stores/default/artifact_store_exporter.py b/src/zenml/log_stores/default/artifact_store_exporter.py index 5a97bf84cd7..5126ae89223 100644 --- a/src/zenml/log_stores/default/artifact_store_exporter.py +++ b/src/zenml/log_stores/default/artifact_store_exporter.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""OpenTelemetry exporter that writes logs to ZenML artifact store.""" +"""OpenTelemetry exporter that writes logs to ZenML artifact store. + +This implementation reuses the proven logic from the original step_logging.py +implementation, including message chunking and JSON line formatting. +""" import time -from datetime import datetime -from typing import TYPE_CHECKING, Optional, Sequence +from typing import TYPE_CHECKING, List, Sequence +from uuid import uuid4 from opentelemetry.sdk._logs.export import LogExporter, LogExportResult @@ -24,7 +28,13 @@ from zenml.artifact_stores import BaseArtifactStore +from zenml.enums import LoggingLevels + +# Import from default_log_store to avoid duplication +from zenml.log_stores.default.default_log_store import remove_ansi_escape_codes from zenml.logger import get_logger +from zenml.logging.logging import DEFAULT_MESSAGE_SIZE, LogEntry +from zenml.utils.time_utils import utc_now logger = get_logger(__name__) @@ -32,9 +42,8 @@ class ArtifactStoreExporter(LogExporter): """OpenTelemetry exporter that writes logs to ZenML artifact store. - Replaces the custom LogsStorage implementation with a standard - OpenTelemetry exporter. Logs are batched by BatchLogRecordProcessor - and written to the artifact store. + This exporter adapts OpenTelemetry log records to the ZenML LogEntry format + and writes them as JSON lines to the artifact store. """ def __init__( @@ -50,12 +59,14 @@ def __init__( """ self.logs_uri = logs_uri self.artifact_store = artifact_store - self.log_buffer: list[str] = [] self.file_counter = 0 def export(self, batch: Sequence["LogData"]) -> LogExportResult: """Export a batch of logs to the artifact store. + Converts OTEL log records to ZenML LogEntry format with proper + message chunking and writes them as JSON lines. + Args: batch: Sequence of LogData to export. @@ -66,20 +77,15 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.SUCCESS try: - # Format logs log_lines = [] for log_data in batch: log_record = log_data.log_record - # Format as ZenML log entry - log_line = self._format_log_entry( - message=str(log_record.body) if log_record.body else "", - level=log_record.severity_text, - timestamp_ns=log_record.timestamp, - ) - log_lines.append(log_line) + entries = self._otel_record_to_log_entries(log_record) + for entry in entries: + json_line = entry.model_dump_json(exclude_none=True) + log_lines.append(json_line) - # Write to artifact store if log_lines: self._write_to_artifact_store(log_lines) @@ -89,79 +95,163 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: logger.error(f"Failed to export logs to artifact store: {e}") return LogExportResult.FAILURE - def _format_log_entry( - self, - message: str, - level: Optional[str], - timestamp_ns: Optional[int], - ) -> str: - """Format a log entry in ZenML format. 
+ def _otel_record_to_log_entries( + self, log_record: "LogData" + ) -> List[LogEntry]: + """Convert an OTEL log record to one or more ZenML LogEntry objects. + + Handles message chunking for large messages and extracts all relevant + metadata from the OTEL record. Args: - message: The log message. - level: The log level (DEBUG, INFO, etc.). - timestamp_ns: Timestamp in nanoseconds. + log_record: The OpenTelemetry log record. Returns: - Formatted log line. + List of LogEntry objects (multiple if message was chunked). """ - # Convert timestamp - if timestamp_ns: - timestamp = datetime.fromtimestamp(timestamp_ns / 1e9) - timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S,%f")[:-3] - else: - timestamp_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[ - :-3 + message = str(log_record.body) if log_record.body else "" + message = remove_ansi_escape_codes(message).rstrip() + + level = self._map_severity_to_level(log_record.severity_text) + + + name = "unknown" + module = None + filename = None + lineno = None + + if log_record.attributes: + attrs = dict(log_record.attributes) + filename = attrs.get("code.filepath", None) + lineno = attrs.get("code.lineno", None) + module = attrs.get("code.function", None) + + message_bytes = message.encode("utf-8") + if len(message_bytes) <= DEFAULT_MESSAGE_SIZE: + return [ + LogEntry( + message=message, + name=name, + level=level, + timestamp=utc_now(tz_aware=True), + module=module, + filename=filename, + lineno=lineno, + ) ] + else: + chunks = self._split_to_chunks(message) + entry_id = uuid4() + entries = [] + + for i, chunk in enumerate(chunks): + entries.append( + LogEntry( + message=chunk, + name=name, + level=level, + timestamp=utc_now(tz_aware=True), + module=module, + filename=filename, + lineno=lineno, + chunk_index=i, + total_chunks=len(chunks), + id=entry_id, + ) + ) - # Map OTel severity to ZenML level - zenml_level = self._map_severity_to_level(level) - - # Format: timestamp|level|message - return f"{timestamp_str}|{zenml_level}|{message}" + return entries - def _map_severity_to_level(self, severity: Optional[str]) -> str: - """Map OpenTelemetry severity to ZenML log level. + def _map_severity_to_level(self, severity_text: str) -> LoggingLevels: + """Map OTEL severity text to ZenML LoggingLevels enum. Args: - severity: OTel severity text. + severity_text: The OTEL severity text. Returns: - ZenML log level string. + The corresponding LoggingLevels enum value. 
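+
+        For example (illustrative of the mapping below)::
+
+            self._map_severity_to_level("WARNING")  # -> LoggingLevels.WARN
+            self._map_severity_to_level("")         # -> LoggingLevels.INFO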
""" - if not severity: - return "INFO" - - severity_upper = severity.upper() - if "DEBUG" in severity_upper or "TRACE" in severity_upper: - return "DEBUG" - elif "INFO" in severity_upper: - return "INFO" - elif "WARN" in severity_upper: - return "WARNING" - elif "ERROR" in severity_upper: - return "ERROR" - elif ( - "CRITICAL" in severity_upper - or "FATAL" in severity_upper - or "EMERGENCY" in severity_upper - ): - return "CRITICAL" + if not severity_text: + return LoggingLevels.INFO + + severity_upper = severity_text.upper() + + if severity_upper in ["DEBUG", "TRACE"]: + return LoggingLevels.DEBUG + elif severity_upper in ["INFO", "INFORMATION"]: + return LoggingLevels.INFO + elif severity_upper in ["WARN", "WARNING"]: + return LoggingLevels.WARN + elif severity_upper == "ERROR": + return LoggingLevels.ERROR + elif severity_upper in ["CRITICAL", "FATAL", "EMERGENCY"]: + return LoggingLevels.CRITICAL else: - return "INFO" + return LoggingLevels.INFO - def _write_to_artifact_store(self, log_lines: list[str]) -> None: + def _split_to_chunks(self, message: str) -> List[str]: + """Split a large message into chunks. + + Properly handles UTF-8 boundaries to avoid breaking multi-byte characters. + This is the same logic from the original step_logging.py implementation. + + Args: + message: The message to split. + + Returns: + A list of message chunks. + """ + message_bytes = message.encode("utf-8") + chunks = [] + start = 0 + + while start < len(message_bytes): + # Calculate the end position for this chunk + end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) + + # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue + while end > start: + chunk_bytes = message_bytes[start:end] + try: + chunk_text = chunk_bytes.decode("utf-8") + chunks.append(chunk_text) + break + except UnicodeDecodeError: + # If we can't decode, try a smaller chunk + end -= 1 + else: + # If we can't decode anything, use replacement characters + end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) + chunks.append( + message_bytes[start:end].decode("utf-8", errors="replace") + ) + + start = end + + return chunks + + def _write_to_artifact_store(self, log_lines: List[str]) -> None: """Write log lines to the artifact store. + Generates a unique timestamped filename for each batch and writes + the log lines as newline-delimited JSON. + Args: - log_lines: List of formatted log lines. + log_lines: List of JSON-serialized log entries. """ - # Create unique file name with timestamp + # Generate unique filename with timestamp and counter + # This matches the pattern from the original implementation timestamp = int(time.time() * 1000) self.file_counter += 1 - file_uri = f"{self.logs_uri}.{timestamp}.{self.file_counter}" - # Join lines and write + # Use the logs_uri as the base - append timestamp and counter + base_uri = self.logs_uri + if base_uri.endswith(".log"): + base_uri = base_uri[:-4] + + file_uri = f"{base_uri}_{timestamp}_{self.file_counter}.jsonl" + + # Join lines and write (one JSON object per line) content = "\n".join(log_lines) + "\n" try: @@ -175,13 +265,17 @@ def _write_to_artifact_store(self, log_lines: list[str]) -> None: raise def shutdown(self) -> None: - """Shutdown the exporter and flush any remaining logs.""" - if self.log_buffer: + """Shutdown the exporter and cleanup artifact store resources. + + This is important to prevent memory leaks by cleaning up any + cached connections or file handles held by the artifact store. 
+ """ + if hasattr(self, "artifact_store") and self.artifact_store: try: - self._write_to_artifact_store(self.log_buffer) - self.log_buffer.clear() + self.artifact_store.cleanup() + logger.debug("Artifact store cleanup completed") except Exception as e: - logger.warning(f"Error during shutdown flush: {e}") + logger.warning(f"Error during artifact store cleanup: {e}") def force_flush(self, timeout_millis: int = 30000) -> bool: """Force flush any buffered logs. @@ -190,13 +284,7 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: timeout_millis: Timeout in milliseconds. Returns: - True if successful. + True if successful (always true - no buffering at this level). """ - try: - if self.log_buffer: - self._write_to_artifact_store(self.log_buffer) - self.log_buffer.clear() - return True - except Exception as e: - logger.warning(f"Force flush failed: {e}") - return False + # No-op - OTEL BatchLogRecordProcessor handles all flushing + return True diff --git a/src/zenml/log_stores/default/default_log_store.py b/src/zenml/log_stores/default/default_log_store.py index 91c4656b76b..832a2645b98 100644 --- a/src/zenml/log_stores/default/default_log_store.py +++ b/src/zenml/log_stores/default/default_log_store.py @@ -13,68 +13,41 @@ # permissions and limitations under the License. """Default log store implementation.""" -import asyncio -import logging import os -import queue import re -import threading -import time from datetime import datetime from typing import ( TYPE_CHECKING, - Any, Iterator, List, Optional, Union, cast, ) -from uuid import UUID, uuid4 +from uuid import UUID from zenml.artifact_stores import BaseArtifactStore from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client -from zenml.constants import ( - LOGS_MERGE_INTERVAL_SECONDS, - LOGS_STORAGE_MAX_QUEUE_SIZE, - LOGS_STORAGE_QUEUE_TIMEOUT, - LOGS_WRITE_INTERVAL_SECONDS, -) from zenml.enums import LoggingLevels from zenml.exceptions import DoesNotExistException -from zenml.log_stores.base_log_store import BaseLogStore from zenml.log_stores.default.default_log_store_flavor import ( DefaultLogStoreConfig, ) -from zenml.logger import ( - get_logger, - get_storage_log_level, - logging_handlers, -) -from zenml.logging.step_logging import ( - DEFAULT_MESSAGE_SIZE, +from zenml.log_stores.otel.otel_log_store import OtelLogStore +from zenml.logger import get_logger +from zenml.logging.logging import ( MAX_ENTRIES_PER_REQUEST, LogEntry, ) -from zenml.models import ( - LogsRequest, - LogsResponse, -) +from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path -from zenml.utils.time_utils import utc_now from zenml.zen_stores.base_zen_store import BaseZenStore if TYPE_CHECKING: + from opentelemetry.sdk._logs.export import LogExporter + from zenml.artifact_stores import BaseArtifactStore - from zenml.log_stores.default.artifact_store_exporter import ( - ArtifactStoreExporter, - ) - from zenml.logging.step_logging import ( - ArtifactStoreHandler, - LogEntry, - PipelineLogsStorage, - ) ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") @@ -256,453 +229,14 @@ def parse_log_entry(log_line: str) -> Optional[LogEntry]: ) -class LogsStorage: - """Helper class which buffers and stores logs to a given URI using a background thread.""" - - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - max_queue_size: int = LOGS_STORAGE_MAX_QUEUE_SIZE, - queue_timeout: int = LOGS_STORAGE_QUEUE_TIMEOUT, - write_interval: int = LOGS_WRITE_INTERVAL_SECONDS, - 
merge_files_interval: int = LOGS_MERGE_INTERVAL_SECONDS, - ) -> None: - """Initialization. - - Args: - logs_uri: the URI of the log file or folder. - artifact_store: Artifact Store from the current step context - max_queue_size: maximum number of individual messages to queue. - queue_timeout: timeout in seconds for putting items in queue when full. - - Positive value: Wait N seconds, then drop logs if queue still full - - Negative value: Block indefinitely until queue has space (never drop logs) - write_interval: the amount of seconds before the created files - get written to the artifact store. - merge_files_interval: the amount of seconds before the created files - get merged into a single file. - """ - # Parameters - self.logs_uri = logs_uri - self.max_queue_size = max_queue_size - self.queue_timeout = queue_timeout - self.write_interval = write_interval - self.merge_files_interval = merge_files_interval - - # State - self.artifact_store = artifact_store - - # Immutable filesystems state - self.last_merge_time = time.time() - - # Queue and log storage thread for async processing - self.log_queue: queue.Queue[str] = queue.Queue(maxsize=max_queue_size) - self.log_storage_thread: Optional[threading.Thread] = None - self.shutdown_event = threading.Event() - self.merge_event = threading.Event() - - # Start the log storage thread - self._start_log_storage_thread() - - def _start_log_storage_thread(self) -> None: - """Start the log storage thread for processing log queue.""" - if ( - self.log_storage_thread is None - or not self.log_storage_thread.is_alive() - ): - self.log_storage_thread = threading.Thread( - target=self._log_storage_worker, - name="LogsStorage-Worker", - ) - self.log_storage_thread.start() - - def _process_log_queue(self, force_merge: bool = False) -> None: - """Write and merge logs to the artifact store using time-based batching. - - Args: - force_merge: Whether to force merge the logs. - """ - try: - messages = [] - - # Get first message (blocking with timeout) - try: - first_message = self.log_queue.get(timeout=1) - messages.append(first_message) - except queue.Empty: - return - - # Get any remaining messages without waiting (drain quickly) - while True: - try: - additional_message = self.log_queue.get_nowait() - messages.append(additional_message) - except queue.Empty: - break - - # Write the messages to the artifact store - if messages: - self.write_buffer(messages) - - # Merge the log files if needed - if ( - self._is_merge_needed - or self.merge_event.is_set() - or force_merge - ): - self.merge_event.clear() - - self.merge_log_files(merge_all_files=force_merge) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - finally: - for _ in messages: - self.log_queue.task_done() - - # Wait for the next write interval or until shutdown is requested - self.shutdown_event.wait(timeout=self.write_interval) - - def _log_storage_worker(self) -> None: - """Log storage thread worker that processes the log queue.""" - # Process the log queue until shutdown is requested - while not self.shutdown_event.is_set(): - self._process_log_queue() - - # Shutdown requested - drain remaining queue items and merge log files - self._process_log_queue(force_merge=True) - - def _shutdown_log_storage_thread(self, timeout: int = 5) -> None: - """Shutdown the log storage thread gracefully. - - Args: - timeout: Maximum time to wait for thread shutdown. 
- """ - if self.log_storage_thread and self.log_storage_thread.is_alive(): - # Then signal the worker to begin graceful shutdown - self.shutdown_event.set() - - # Wait for thread to finish (it will drain the queue automatically) - self.log_storage_thread.join(timeout=timeout) - - def write(self, text: str) -> None: - """Main write method that sends individual messages directly to queue. - - Args: - text: the incoming string. - """ - # Skip empty lines - if text == "\n": - return - - # If the current thread is the log storage thread, do nothing - # to prevent recursion when the storage thread itself generates logs - if ( - self.log_storage_thread - and threading.current_thread() == self.log_storage_thread - ): - return - - # If the current thread is the fsspec IO thread, do nothing - if self._is_fsspec_io_thread: - return - - try: - # Send individual message directly to queue - if not self.shutdown_event.is_set(): - try: - if self.queue_timeout < 0: - # Negative timeout = block indefinitely until queue has space - # Guarantees no log loss but may hang application - self.log_queue.put(text) - else: - # Positive timeout = wait specified time then drop logs - # Prevents application hanging but may lose logs - self.log_queue.put(text, timeout=self.queue_timeout) - except queue.Full: - # This only happens with positive timeout - # Queue is full - just skip this message to avoid blocking - # Better to drop logs than hang the application - pass - - except Exception: - # Silently ignore errors to prevent recursion - pass - - @property - def _is_merge_needed(self) -> bool: - """Checks whether the log files need to be merged. - - Returns: - whether the log files need to be merged. - """ - return ( - self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM - and time.time() - self.last_merge_time > self.merge_files_interval - ) - - @property - def _is_fsspec_io_thread(self) -> bool: - """Checks if the current thread is the fsspec IO thread. - - Returns: - whether the current thread is the fsspec IO thread. - """ - # Most artifact stores are based on fsspec, which converts between - # sync and async operations by using a separate AIO thread. - # It may happen that the fsspec call itself will log something, - # which will trigger this method, which may then use fsspec again, - # causing a "Calling sync() from within a running loop" error, because - # the fsspec library does not expect sync calls being made as a result - # of a logging call made by itself. - # To avoid this, we simply check if we're running in the fsspec AIO - # thread and skip the save if that's the case. - try: - return ( - asyncio.events.get_running_loop() is not None - and threading.current_thread().name == "fsspecIO" - ) - except RuntimeError: - # No running loop - return False - - def _get_timestamped_filename(self, suffix: str = "") -> str: - """Returns a timestamped filename. - - Args: - suffix: optional suffix for the file name - - Returns: - The timestamped filename. - """ - return f"{time.time()}{suffix}{LOGS_EXTENSION}" - - def write_buffer(self, buffer_to_write: List[str]) -> None: - """Write the given buffer to file. This runs in the log storage thread. - - Args: - buffer_to_write: The buffer contents to write to file. 
- """ - if not buffer_to_write: - return - - try: - # If the artifact store is immutable, write the buffer to a new file - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - _logs_uri = self._get_timestamped_filename() - with self.artifact_store.open( - os.path.join( - self.logs_uri, - _logs_uri, - ), - "w", - ) as file: - for message in buffer_to_write: - file.write(f"{message}\n") - - # If the artifact store is mutable, append the buffer to the existing file - else: - with self.artifact_store.open(self.logs_uri, "a") as file: - for message in buffer_to_write: - file.write(f"{message}\n") - self.artifact_store._remove_previous_file_versions( - self.logs_uri - ) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - - def merge_log_files(self, merge_all_files: bool = False) -> None: - """Merges all log files into one in the given URI. - - Called on the logging context exit. - - Args: - merge_all_files: whether to merge all files or only raw files - """ - from zenml.artifacts.utils import ( - _load_file_from_artifact_store, - ) - - # If the artifact store is immutable, merge the log files - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - merged_file_suffix = "_merged" - files_ = self.artifact_store.listdir(self.logs_uri) - if not merge_all_files: - # already merged files will not be merged again - files_ = [ - f for f in files_ if merged_file_suffix not in str(f) - ] - file_name_ = self._get_timestamped_filename( - suffix=merged_file_suffix - ) - if len(files_) > 1: - files_.sort() - logger.debug("Log files count: %s", len(files_)) - - missing_files = set() - # dump all logs to a local file first - with self.artifact_store.open( - os.path.join(self.logs_uri, file_name_), "w" - ) as merged_file: - for file in files_: - try: - merged_file.write( - str( - _load_file_from_artifact_store( - os.path.join(self.logs_uri, str(file)), - artifact_store=self.artifact_store, - mode="r", - ) - ) - ) - except DoesNotExistException: - missing_files.add(file) - - # clean up left over files - for file in files_: - if file not in missing_files: - self.artifact_store.remove( - os.path.join(self.logs_uri, str(file)) - ) - - # Update the last merge time - self.last_merge_time = time.time() - - def send_merge_event(self) -> None: - """Send a merge event to the log storage thread.""" - self.merge_event.set() - - -class ArtifactStoreHandler(logging.Handler): - """Handler that writes log messages to artifact store storage.""" - - def __init__(self, storage: "PipelineLogsStorage"): - """Initialize the handler with a storage instance. - - Args: - storage: The PipelineLogsStorage instance to write to. - """ - super().__init__() - self.storage = storage - - # Get storage log level from environment - self.setLevel(get_storage_log_level().value) - - def emit(self, record: logging.LogRecord) -> None: - """Emit a log record to the storage. - - Args: - record: The log record to emit. 
- """ - try: - # Get level enum - level = LoggingLevels.__members__.get(record.levelname.upper()) - - # Get the message - message = self.format(record) - message = remove_ansi_escape_codes(message).rstrip() - - # Check if message needs to be chunked - message_bytes = message.encode("utf-8") - if len(message_bytes) <= DEFAULT_MESSAGE_SIZE: - # Message is small enough, emit as-is - log_record = LogEntry.model_construct( - message=message, - name=record.name, - level=level, - timestamp=utc_now(tz_aware=True), - module=record.module, - filename=record.filename, - lineno=record.lineno, - ) - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - else: - # Message is too large, split into chunks and emit each one - chunks = self._split_to_chunks(message) - entry_id = uuid4() - for i, chunk in enumerate(chunks): - log_record = LogEntry.model_construct( - message=chunk, - name=record.name, - level=level, - module=record.module, - filename=record.filename, - lineno=record.lineno, - timestamp=utc_now(tz_aware=True), - chunk_index=i, - total_chunks=len(chunks), - id=entry_id, - ) - - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - except Exception: - pass - - def _split_to_chunks(self, message: str) -> List[str]: - """Split a large message into chunks. - - Args: - message: The message to split. - - Returns: - A list of message chunks. - """ - # Calculate how many chunks we need - message_bytes = message.encode("utf-8") - - # Split the message into chunks, handling UTF-8 boundaries - chunks = [] - start = 0 - - while start < len(message_bytes): - # Calculate the end position for this chunk - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - - # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue - while end > start: - chunk_bytes = message_bytes[start:end] - try: - chunk_text = chunk_bytes.decode("utf-8") - chunks.append(chunk_text) - break - except UnicodeDecodeError: - # If we can't decode, try a smaller chunk - end -= 1 - else: - # If we can't decode anything, use replacement characters - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - chunks.append( - message_bytes[start:end].decode("utf-8", errors="replace") - ) - - start = end - - return chunks - - -class DefaultLogStore(BaseLogStore): +class DefaultLogStore(OtelLogStore): """Log store that saves logs to the artifact store. - This implementation uses OpenTelemetry infrastructure to write logs - to the artifact store. Uses shared BatchLogRecordProcessor with - thread pool for efficient parallel exports. + This implementation extends OtelLogStore and uses the ArtifactStoreExporter + to write logs to the artifact store. Inherits all OTEL infrastructure + including shared BatchLogRecordProcessor and routing. """ - def __init__(self, *args: Any, **kwargs: Any) -> None: - """Initialize the default log store. - - Args: - *args: Positional arguments for the base class. - **kwargs: Keyword arguments for the base class. - """ - super().__init__(*args, **kwargs) - self._exporter: Optional["ArtifactStoreExporter"] = None - self._handler: Optional[logging.Handler] = None - self._log_id: Optional[str] = None - @property def config(self) -> DefaultLogStoreConfig: """Returns the configuration of the default log store. 
@@ -712,102 +246,28 @@ def config(self) -> DefaultLogStoreConfig: """ return cast(DefaultLogStoreConfig, self._config) - def activate(self, log_request: "LogsRequest") -> None: - """Activate log collection to the artifact store. + def get_exporter(self) -> "LogExporter": + """Get the artifact store exporter for this log store. - Args: - log_request: The log request model. + Returns: + The ArtifactStoreExporter instance. """ - from opentelemetry.sdk._logs import LoggingHandler - from opentelemetry.sdk.resources import Resource - from zenml.log_stores.default.artifact_store_exporter import ( ArtifactStoreExporter, ) - from zenml.logging.otel_logging_infrastructure import ( - get_shared_otel_infrastructure, - ) - from zenml.logging.routing_handler import set_active_log_store - - # Get shared OTel infrastructure - logger_provider, routing_exporter = get_shared_otel_infrastructure() - - # Create artifact store exporter for this log store - self._exporter = ArtifactStoreExporter( - logs_uri=log_request.uri, - artifact_store=Client().active_stack.artifact_store, - ) - - # Register exporter with routing exporter - self._log_id = str(log_request.id) - routing_exporter.register_exporter(self._log_id, self._exporter) - - # Create resource with log_id and LoggerProvider - from opentelemetry.sdk._logs import LoggerProvider - - resource = Resource.create({"zenml.log_id": self._log_id}) - self._logger_provider_with_resource = LoggerProvider(resource=resource) + from zenml.logging.logging import get_active_log_model - # Share the same processor (routing exporter) from the global provider - for processor in ( - logger_provider._multi_log_record_processor._log_record_processors - ): - self._logger_provider_with_resource.add_log_record_processor( - processor + log_model = get_active_log_model() + if not log_model: + raise RuntimeError( + "get_exporter() called outside of an active logging context. " + "This should not happen." ) - self._handler = LoggingHandler( - level=get_storage_log_level().value, - logger_provider=self._logger_provider_with_resource, - ) - - # Register this log store for routing - set_active_log_store(self) - - # Add to context variables for print capture - logging_handlers.add(self._handler) - - def emit(self, record: logging.LogRecord) -> None: - """Process a log record by sending to artifact store. - - Args: - record: The log record to process. 
- """ - if self._handler: - try: - self._handler.emit(record) - except Exception: - # Don't let logging errors break execution - pass - - def deactivate(self) -> None: - """Deactivate log collection and flush remaining logs.""" - if not self._handler: - return - - # Unregister from the current thread's context - from zenml.logging.otel_logging_infrastructure import ( - get_shared_otel_infrastructure, + return ArtifactStoreExporter( + logs_uri=log_model.uri, + artifact_store=Client().active_stack.artifact_store, ) - from zenml.logging.routing_handler import set_active_log_store - - set_active_log_store(None) - - # Remove from context variables - logging_handlers.remove(self._handler) - - # Unregister exporter from routing - if self._log_id and self._exporter: - _, routing_exporter = get_shared_otel_infrastructure() - routing_exporter.unregister_exporter(self._log_id) - - # Flush exporter - try: - self._exporter.force_flush() - except Exception as e: - logger.warning(f"Error flushing exporter: {e}") - - logger.debug("DefaultLogStore deactivated") def fetch( self, @@ -815,6 +275,7 @@ def fetch( start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = 20000, + message_size: int = 5120, ) -> List["LogEntry"]: """Fetch logs from the artifact store. @@ -823,6 +284,7 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. + message_size: Maximum size of a single log message in bytes. Returns: List of log entries from the artifact store. @@ -830,8 +292,6 @@ def fetch( Raises: ValueError: If logs_model.uri is not provided. """ - from zenml.logging.step_logging import fetch_log_records - if not logs_model.uri: raise ValueError( "logs_model.uri is required for DefaultLogStore.fetch()" diff --git a/src/zenml/log_stores/default/default_log_store_flavor.py b/src/zenml/log_stores/default/default_log_store_flavor.py index c9a8850df70..40f0a49676c 100644 --- a/src/zenml/log_stores/default/default_log_store_flavor.py +++ b/src/zenml/log_stores/default/default_log_store_flavor.py @@ -16,18 +16,20 @@ from typing import TYPE_CHECKING, Type from zenml.enums import StackComponentType -from zenml.log_stores.base_log_store import BaseLogStoreConfig +from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.stack.flavor import Flavor if TYPE_CHECKING: from zenml.log_stores.base_log_store import BaseLogStore -class DefaultLogStoreConfig(BaseLogStoreConfig): +class DefaultLogStoreConfig(OtelLogStoreConfig): """Configuration for the default log store. - This log store saves logs to the artifact store, which is the default - and backward-compatible approach. + This log store saves logs to the artifact store using OTEL infrastructure, + which is the default and backward-compatible approach. + + Inherits OTEL configuration like service_name, batch sizes, etc. """ @@ -81,7 +83,7 @@ def type(self) -> StackComponentType: return StackComponentType.LOG_STORE @property - def config_class(self) -> Type[BaseLogStoreConfig]: + def config_class(self) -> Type[DefaultLogStoreConfig]: """Returns `DefaultLogStoreConfig` config class. 
Returns: diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 82bbbbcfe18..ca26c5c59b3 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -18,20 +18,21 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, List, Optional, cast -from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry._logs.severity import SeverityNumber +from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource +from zenml import __version__ from zenml.log_stores.base_log_store import BaseLogStore from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig -from zenml.logger import get_logger, get_storage_log_level, logging_handlers -from zenml.models import LogsRequest +from zenml.logger import get_logger +from zenml.models import LogsResponse if TYPE_CHECKING: - from opentelemetry.sdk._logs import LoggerProvider from opentelemetry.sdk._logs.export import LogExporter - from zenml.logging.step_logging import LogEntry - from zenml.models import LogsResponse + from zenml.logging.logging import LogEntry logger = get_logger(__name__) @@ -39,12 +40,12 @@ class OtelLogStore(BaseLogStore): """Log store that exports logs using OpenTelemetry. - This implementation uses the OpenTelemetry SDK to collect and export logs - to various backends. It uses a BatchLogRecordProcessor for efficient - background processing. + Each instance creates its own BatchLogRecordProcessor and background thread. + This is simpler than shared infrastructure but means more threads when + multiple log stores are active simultaneously. Subclasses should implement `get_exporter()` to provide the specific - log exporter for their backend (e.g., console, OTLP, Datadog). + log exporter for their backend (e.g., ArtifactStoreExporter, DatadogLogExporter). """ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -55,10 +56,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: **kwargs: Keyword arguments for the base class. """ super().__init__(*args, **kwargs) - self._logger_provider_with_resource: Optional["LoggerProvider"] = None - self._handler: Optional[logging.Handler] = None + + self._resource: Optional["Resource"] = None self._exporter: Optional["LogExporter"] = None - self._log_id: Optional[str] = None + self._provider: Optional["LoggerProvider"] = None + self._processor: Optional["BatchLogRecordProcessor"] = None @property def config(self) -> OtelLogStoreConfig: @@ -74,66 +76,36 @@ def get_exporter(self) -> "LogExporter": """Get the log exporter for this log store. Subclasses must implement this method to provide the appropriate - exporter for their backend (e.g., ConsoleLogExporter, OTLPLogExporter). + exporter for their backend. Returns: The log exporter instance. """ - def activate(self, log_request: "LogsRequest") -> None: - """Activate log collection with OpenTelemetry. - - Args: - log_request: The log request model. 
- """ - from zenml.logging.otel_logging_infrastructure import ( - get_shared_otel_infrastructure, - ) - from zenml.logging.routing_handler import set_active_log_store + def activate(self) -> None: + """Activate log collection with OpenTelemetry.""" + from zenml.logging.logging import get_active_log_model - # Get shared OTel infrastructure - logger_provider, routing_exporter = get_shared_otel_infrastructure() + log_model = get_active_log_model() + if not log_model: + raise RuntimeError( + "activate() called outside of an active logging context. " + "This should not happen." + ) - # Get exporter for this log store self._exporter = self.get_exporter() + self._processor = BatchLogRecordProcessor(self._exporter) - # Register exporter with routing exporter - self._log_id = str(log_request.id) - routing_exporter.register_exporter(self._log_id, self._exporter) - - # Create resource with log_id and service info - otel_resource = Resource.create( + self._resource = Resource.create( { "service.name": self.config.service_name, - "service.version": "0.91.0", # TODO: Fetch this - "zenml.log_id": self._log_id, + "service.version": __version__, + "zenml.log_id": str(log_model.id), } ) - # Create logger provider with this resource - self._logger_provider_with_resource = LoggerProvider( - resource=otel_resource - ) - - # Share the same processor (routing exporter) from the global provider - for processor in ( - logger_provider._multi_log_record_processor._log_record_processors - ): - self._logger_provider_with_resource.add_log_record_processor( - processor - ) - - # Create handler - self._handler = LoggingHandler( - level=get_storage_log_level().value, - logger_provider=self._logger_provider_with_resource, - ) - - # Register this log store for routing - set_active_log_store(self) - - # Add to context variables for print capture - logging_handlers.add(self._handler) + self._provider = LoggerProvider(resource=self._resource) + self._provider.add_log_record_processor(self._processor) def emit(self, record: logging.LogRecord) -> None: """Process a log record by sending to OpenTelemetry. @@ -141,39 +113,70 @@ def emit(self, record: logging.LogRecord) -> None: Args: record: The log record to process. """ - if self._handler: - try: - self._handler.emit(record) - except Exception: - # Don't let logging errors break execution - pass - - def deactivate(self) -> None: - """Deactivate log collection and flush remaining logs.""" - if not self._handler: + if not self._provider: return - # Unregister from the current thread's context - from zenml.logging.otel_logging_infrastructure import ( - get_shared_otel_infrastructure, - ) - from zenml.logging.routing_handler import set_active_log_store + try: + otel_logger = self._provider.get_logger( + record.name or "unknown", + schema_url=None, + ) + otel_logger.emit( + timestamp=int(record.created * 1e9), + observed_timestamp=int(record.created * 1e9), + severity_number=self._get_severity_number(record.levelno), + severity_text=record.levelname, + body=record.getMessage(), + attributes={ + "code.filepath": record.pathname, + "code.lineno": record.lineno, + "code.function": record.funcName, + }, + ) + + except Exception: + pass - set_active_log_store(None) + def _get_severity_number(self, levelno: int) -> int: + """Map Python log level to OTEL severity number. - # Remove from context variables - logging_handlers.remove(self._handler) + Args: + levelno: Python logging level number. 
- # Unregister exporter from routing - if self._log_id and self._exporter: - _, routing_exporter = get_shared_otel_infrastructure() - routing_exporter.unregister_exporter(self._log_id) + Returns: + OTEL severity number. + """ + if levelno >= logging.CRITICAL: + return SeverityNumber.FATAL.value + elif levelno >= logging.ERROR: + return SeverityNumber.ERROR.value + elif levelno >= logging.WARNING: + return SeverityNumber.WARN.value + elif levelno >= logging.INFO: + return SeverityNumber.INFO.value + elif levelno >= logging.DEBUG: + return SeverityNumber.DEBUG.value + else: + return SeverityNumber.UNSPECIFIED.value + + def deactivate(self) -> None: + """Deactivate log collection and shut down the processor. + + Flushes any pending logs and shuts down the processor's background thread. + """ + if self._processor: + try: + # Force flush any pending logs + self._processor.force_flush(timeout_millis=5000) + logger.debug("Flushed pending logs") + except Exception as e: + logger.warning(f"Error flushing logs: {e}") - # Flush exporter try: - self._exporter.force_flush() + self._processor.shutdown() + logger.debug("Shut down log processor and background thread") except Exception as e: - logger.warning(f"Error flushing exporter: {e}") + logger.warning(f"Error shutting down processor: {e}") logger.debug("OtelLogStore deactivated") @@ -184,6 +187,7 @@ def fetch( start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = 20000, + message_size: int = 5120, ) -> List["LogEntry"]: """Fetch logs from the OpenTelemetry backend. @@ -196,6 +200,7 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. + message_size: Maximum size of a single log message in bytes. Returns: List of log entries from the backend. diff --git a/src/zenml/logging/otel_logging_infrastructure.py b/src/zenml/log_stores/otel/otel_provider.py similarity index 76% rename from src/zenml/logging/otel_logging_infrastructure.py rename to src/zenml/log_stores/otel/otel_provider.py index 26d90442396..7550a821403 100644 --- a/src/zenml/logging/otel_logging_infrastructure.py +++ b/src/zenml/log_stores/otel/otel_provider.py @@ -11,19 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Shared OpenTelemetry logging infrastructure for all log stores. - -Provides a unified backend using a single BatchLogRecordProcessor. -All log stores share this infrastructure, routing logs by log_id to -specific exporters. 
-""" +"""OpenTelemetry logging infrastructure for ZenML.""" +import atexit import concurrent.futures import threading import time from typing import TYPE_CHECKING, Dict, Optional, Sequence -from opentelemetry.sdk._logs import LoggerProvider from opentelemetry.sdk._logs.export import ( BatchLogRecordProcessor, LogExporter, @@ -38,7 +33,7 @@ logger = get_logger(__name__) # Global shared infrastructure (singleton per process) -_shared_logger_provider: Optional[LoggerProvider] = None +_shared_processor: Optional[BatchLogRecordProcessor] = None _routing_exporter: Optional["RoutingLogExporter"] = None _infrastructure_lock = threading.Lock() @@ -74,18 +69,25 @@ def register_exporter(self, log_id: str, exporter: LogExporter) -> None: """ with self._lock: self._exporters[log_id] = exporter - logger.debug(f"Registered exporter for log_id: {log_id}") def unregister_exporter(self, log_id: str) -> None: """Unregister an exporter for a specific log_id. + Also calls shutdown() on the exporter to cleanup resources + and prevent memory leaks. + Args: log_id: The log_id to unregister. """ with self._lock: exporter = self._exporters.pop(log_id, None) if exporter: - logger.debug(f"Unregistered exporter for log_id: {log_id}") + try: + exporter.shutdown() + except Exception as e: + logger.warning( + f"Error shutting down exporter for {log_id}: {e}" + ) def export(self, batch: Sequence["LogData"]) -> LogExportResult: """Route logs to appropriate exporters based on log_id. @@ -126,7 +128,6 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: for log_id, logs in logs_by_id.items(): exporter = self._exporters.get(log_id) if exporter: - # Submit to thread pool (non-blocking) future = self._executor.submit( self._safe_export, exporter, logs, log_id ) @@ -189,11 +190,7 @@ def _safe_export( def shutdown(self) -> None: """Shutdown the routing exporter and thread pool.""" - logger.debug("Shutting down routing exporter thread pool") - try: - self._executor.shutdown(wait=True, timeout=30) - except Exception as e: - logger.warning(f"Error shutting down thread pool: {e}") + self._executor.shutdown(wait=True) def force_flush(self, timeout_millis: int = 30000) -> bool: """Force flush any buffered logs. @@ -204,7 +201,6 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: Returns: True if successful. """ - # Flush all registered exporters in parallel futures = [] with self._lock: for exporter in self._exporters.values(): @@ -213,7 +209,6 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: ) futures.append(future) - # Wait for all flushes all_success = True timeout_sec = timeout_millis / 1000.0 try: @@ -250,58 +245,47 @@ def _safe_flush(self, exporter: LogExporter, timeout_millis: int) -> bool: def get_shared_otel_infrastructure() -> tuple[ - LoggerProvider, RoutingLogExporter + BatchLogRecordProcessor, RoutingLogExporter ]: """Get or create shared OpenTelemetry logging infrastructure. - Creates a single LoggerProvider with BatchLogRecordProcessor and - RoutingLogExporter that all log stores share. + Creates a single BatchLogRecordProcessor with RoutingLogExporter that + all log stores share. Each log store creates its own LoggerProvider + with a unique resource. Returns: - Tuple of (LoggerProvider, RoutingLogExporter). + Tuple of (shared BatchLogRecordProcessor, RoutingLogExporter). 
""" - global _shared_logger_provider, _routing_exporter + global _shared_processor, _routing_exporter - if _shared_logger_provider is None: + if _shared_processor is None: with _infrastructure_lock: - if _shared_logger_provider is None: - logger.info( - "Initializing shared OTel logging infrastructure " - "with 1 background thread" - ) - - # Create routing exporter - _routing_exporter = RoutingLogExporter() - - # Create shared logger provider - _shared_logger_provider = LoggerProvider() - - # One background thread for all log stores - processor = BatchLogRecordProcessor( - _routing_exporter, - max_queue_size=4096, # Larger for shared use - schedule_delay_millis=1000, # Batch every 1 second - max_export_batch_size=512, # Export in batches of 512 - ) - _shared_logger_provider.add_log_record_processor(processor) + _routing_exporter = RoutingLogExporter() + _shared_processor = BatchLogRecordProcessor( + _routing_exporter, + max_queue_size=4096, + schedule_delay_millis=1000, + max_export_batch_size=512, + ) + atexit.register(shutdown_shared_infrastructure) - return _shared_logger_provider, _routing_exporter + return _shared_processor, _routing_exporter def shutdown_shared_infrastructure() -> None: """Shutdown the shared OpenTelemetry infrastructure. - This should be called on process shutdown to cleanly close all resources. + This is called on process exit via atexit. It shuts down the shared + processor (which stops the background thread) and the routing exporter. """ - global _shared_logger_provider, _routing_exporter + global _shared_processor, _routing_exporter - if _shared_logger_provider: - logger.info("Shutting down shared OTel logging infrastructure") + if _shared_processor: try: - _shared_logger_provider.force_flush() - _shared_logger_provider.shutdown() + _shared_processor.force_flush() + _shared_processor.shutdown() except Exception as e: - logger.warning(f"Error during shutdown: {e}") + logger.warning(f"Error during processor shutdown: {e}") if _routing_exporter: try: @@ -309,5 +293,5 @@ def shutdown_shared_infrastructure() -> None: except Exception as e: logger.warning(f"Error shutting down routing exporter: {e}") - _shared_logger_provider = None + _shared_processor = None _routing_exporter = None diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index 15535a70b5c..37632253281 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, List, Optional if TYPE_CHECKING: - from zenml.logging.step_logging import LogEntry + from zenml.logging.logging import LogEntry from zenml.models import LogsResponse from zenml.zen_stores.base_zen_store import BaseZenStore diff --git a/src/zenml/logger.py b/src/zenml/logger.py index 63f78e8fa04..5dc50029538 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -18,7 +18,7 @@ import os import sys from contextvars import ContextVar -from typing import Any, Dict, Optional +from typing import Any, Optional from rich.traceback import install as rich_tb_install @@ -31,42 +31,28 @@ ) from zenml.enums import LoggingLevels -ZENML_LOGGING_COLORS_DISABLED = handle_bool_env_var( - ENV_ZENML_LOGGING_COLORS_DISABLED, False -) - - -# Logic for formatting console messages step_names_in_console: ContextVar[bool] = ContextVar( "step_names_in_console", default=False ) -grey: str = "\x1b[90m" -white: str = "\x1b[37m" -pink: str = "\x1b[35m" -green: str = "\x1b[32m" -yellow: str = "\x1b[33m" -red: str = "\x1b[31m" -cyan: str = "\x1b[1;36m" -bold_red: str = "\x1b[31;1m" 
-purple: str = "\x1b[38;5;105m" -blue: str = "\x1b[34m" -reset: str = "\x1b[0m" - -COLORS: Dict[LoggingLevels, str] = { - LoggingLevels.DEBUG: grey, - LoggingLevels.INFO: white, - LoggingLevels.WARN: yellow, - LoggingLevels.ERROR: red, - LoggingLevels.CRITICAL: bold_red, -} - _original_stdout_write: Optional[Any] = None _original_stderr_write: Optional[Any] = None _stdout_wrapped: bool = False _stderr_wrapped: bool = False +def get_logger(logger_name: str) -> logging.Logger: + """Main function to get logger name,. + + Args: + logger_name: Name of logger to initialize. + + Returns: + A logger object. + """ + return logging.getLogger(logger_name) + + def _add_step_name_to_message(message: str) -> str: """Adds the step name to the message. @@ -97,29 +83,71 @@ def _add_step_name_to_message(message: str) -> str: return message -def format_console_message(message: str) -> str: - """Format a message for console output. +def format_console_message( + message: str, level: LoggingLevels = LoggingLevels.INFO +) -> str: + """Format a message for console output with colors and step names. + + This function applies: + 1. Step name prefixing (if step_names_in_console is True) + 2. Color formatting (unless ZENML_LOGGING_COLORS_DISABLED) + 3. Special formatting for quoted text (purple) and URLs (blue) Args: message: The message to format. + level: The logging level for color selection. Returns: The formatted message. """ - return message + import re + try: + if step_names_in_console.get(): + message = _add_step_name_to_message(message) + except Exception: + pass -# Logger utilities -def get_logger(logger_name: str) -> logging.Logger: - """Main function to get logger name,. + if handle_bool_env_var(ENV_ZENML_LOGGING_COLORS_DISABLED, False): + return message + + grey = "\x1b[90m" + white = "\x1b[37m" + yellow = "\x1b[33m" + red = "\x1b[31m" + bold_red = "\x1b[31;1m" + purple = "\x1b[38;5;105m" + blue = "\x1b[34m" + reset = "\x1b[0m" + + COLORS = { + LoggingLevels.DEBUG: grey, + LoggingLevels.INFO: white, + LoggingLevels.WARN: yellow, + LoggingLevels.ERROR: red, + LoggingLevels.CRITICAL: bold_red, + } + + level_color = COLORS.get(level, white) + + formatted_message = f"{level_color}{message}{reset}" + + quoted_groups = re.findall("`([^`]*)`", formatted_message) + for quoted in quoted_groups: + formatted_message = formatted_message.replace( + "`" + quoted + "`", + f"{reset}{purple}{quoted}{level_color}", + ) - Args: - logger_name: Name of logger to initialize. + url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" + urls = re.findall(url_pattern, formatted_message) + for url in urls: + formatted_message = formatted_message.replace( + url, + f"{reset}{blue}{url}{level_color}", + ) - Returns: - A logger object. 
- """ - return logging.getLogger(logger_name) + return formatted_message def get_logging_level() -> LoggingLevels: @@ -173,7 +201,10 @@ def format(self, record: logging.LogRecord) -> str: "timestamp": self.formatTime(record, datefmt="%Y-%m-%dT%H:%M:%S"), "level": record.levelname, "name": record.name, - "message": record.getMessage(), + "msg": record.getMessage(), + "module": record.module, + "filename": record.filename, + "lineno": record.lineno, } if record.exc_info: @@ -184,51 +215,61 @@ def format(self, record: logging.LogRecord) -> str: def _wrapped_write(original_write: Any, stream_name: str) -> Any: """Wrap stdout/stderr write method to parse and route logs.""" - from zenml.logging.step_logging import get_active_log_store, LogEntry - from zenml.utils import utc_now def wrapped_write(text: str) -> int: """Wrap the write method to parse and route logs.""" + from zenml.logging.logging import get_active_log_store + message = text - name = None + name = "unknown" level = ( LoggingLevels.INFO if stream_name == "info" else LoggingLevels.ERROR ) - timestamp = utc_now() + level_int = getattr(logging, level.name) + pathname = "" + lineno = 0 + funcName = "" - # Try to extract the message from a potential JSONified log entry - if text.startswith("{") and text.endswith("}"): - try: - data = json.loads(text) - - if "zenml" in data and "message" in data: - message = data["message"] - name = data.get("name", None) - level = data.get("level", level) - timestamp = data.get("timestamp", timestamp) - else: - message = data + has_newline = text.endswith("\n") + stripped_text = text.strip() + if stripped_text.startswith("{") and stripped_text.endswith("}"): + try: + data = json.loads(stripped_text) + if "zenml" in data and data["zenml"] is True: + message = data.get("msg", text) + name = data.get("name", name) + level_str = data.get("level", level.name) + if hasattr(LoggingLevels, level_str): + level = getattr(LoggingLevels, level_str) + level_int = getattr(logging, level.name) + pathname = data.get("filename", pathname) + lineno = data.get("lineno", lineno) + funcName = data.get("module", funcName) except Exception: - message = text - - # If there is an active log store - if log_store := get_active_log_store(): - log_store.emit( - LogEntry( - message=message, - name=name, - level=level, - timestamp=timestamp, - ) + pass + + log_store = get_active_log_store() + if log_store: + record = logging.LogRecord( + name=name, + level=level_int, + pathname=pathname, + lineno=lineno, + msg=message, + args=(), + exc_info=None, + func=funcName, ) + log_store.emit(record) - # Format the message for console output - message = format_console_message(message) + formatted_message = format_console_message(message, level) + if has_newline: + formatted_message += "\n" - return original_write(message) + return original_write(formatted_message) return wrapped_write diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/logging.py similarity index 71% rename from src/zenml/logging/step_logging.py rename to src/zenml/logging/logging.py index 3797e89eb0c..3d47c5b9abc 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/logging.py @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. 
-"""ZenML logging handler.""" +"""ZenML logging.""" -import re from contextlib import nullcontext from contextvars import ContextVar from datetime import datetime @@ -22,7 +21,9 @@ TYPE_CHECKING, Any, Optional, + Tuple, Type, + Union, ) from uuid import UUID, uuid4 @@ -34,55 +35,76 @@ handle_bool_env_var, ) from zenml.enums import LoggingLevels -from zenml.logger import ( - get_logger, -) -from zenml.models import ( - LogsRequest, - PipelineSnapshotResponse, -) +from zenml.logger import get_logger +from zenml.models import LogsRequest, PipelineSnapshotResponse from zenml.utils.time_utils import utc_now +logger = get_logger(__name__) + if TYPE_CHECKING: from zenml.log_stores.base_log_store import BaseLogStore + from zenml.models import LogsRequest, LogsResponse -# Active log store per thread -_active_log_store: ContextVar[Optional["BaseLogStore"]] = ContextVar( - "active_log_store", default=None -) +# Maximum number of log entries to return in a single request +MAX_ENTRIES_PER_REQUEST = 20000 +# Maximum size of a single log message in bytes (5KB) +DEFAULT_MESSAGE_SIZE = 5 * 1024 + +# Active log store and its associated log model +_active_log_context: ContextVar[ + Optional[Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]]] +] = ContextVar("active_log_context", default=None) -def set_active_log_store(log_store: Optional["BaseLogStore"]) -> None: - """Set active log store for current thread. +def set_active_log_context( + log_store: Optional["BaseLogStore"], + log_model: Optional[Union["LogsRequest", "LogsResponse"]] = None, +) -> None: + """Set active log store and model for current context. Args: log_store: Log store to activate, or None to deactivate. + log_model: The log model associated with this context. """ - _active_log_store.set(log_store) + if log_store is None: + _active_log_context.set(None) + else: + if log_model is None: + raise ValueError( + "log_model must be provided when log_store is set" + ) + _active_log_context.set((log_store, log_model)) -def get_active_log_store() -> Optional["BaseLogStore"]: - """Get the active log store for the current thread. +def get_active_log_context() -> Optional[ + Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]] +]: + """Get the active log store and model for the current context. Returns: - The active log store, or None if no log store is active. + Tuple of (log_store, log_model), or None if no context is active. """ - return _active_log_store.get() + return _active_log_context.get() -logger = get_logger(__name__) +def get_active_log_store() -> Optional["BaseLogStore"]: + """Get the active log store for the current context. -# Context variables -redirected: ContextVar[bool] = ContextVar("redirected", default=False) + Returns: + The active log store, or None if no log store is active. + """ + context = _active_log_context.get() + return context[0] if context else None -ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") -PIPELINE_RUN_LOGS_FOLDER = "pipeline_runs" +def get_active_log_model() -> Optional[Union["LogsRequest", "LogsResponse"]]: + """Get the active log model for the current context. -# Maximum number of log entries to return in a single request -MAX_ENTRIES_PER_REQUEST = 20000 -# Maximum size of a single log message in bytes (5KB) -DEFAULT_MESSAGE_SIZE = 5 * 1024 + Returns: + The active log model, or None if no context is active. 
+ """ + context = _active_log_context.get() + return context[1] if context else None class LogEntry(BaseModel): @@ -132,9 +154,9 @@ def __init__(self, source: str) -> None: """Initialize the logging context. Args: - source: An identifier for the source of the logs (e.g., "step", "orchestrator") + source: An identifier for the source of the logs + (e.g., "step", "orchestrator") """ - # Create the log store first if Client().active_stack.log_store: self.log_store = Client().active_stack.log_store else: @@ -159,10 +181,14 @@ def __init__(self, source: str) -> None: secrets=[], ) - # Based on the source, generate the log request self.source = source self.log_request = self.generate_log_request() + self._previous_log_context: Optional[ + Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]] + ] = None + self._is_outermost_context: bool = False + def generate_log_request(self) -> "LogsRequest": """Create a log request model. @@ -197,10 +223,19 @@ def generate_log_request(self) -> "LogsRequest": def __enter__(self) -> "LoggingContext": """Enter the context and activate log collection. + Saves the current active context to restore it on exit, + enabling nested logging contexts. + Returns: self """ - self.log_store.activate(log_request=self.log_request) + self._previous_log_context = get_active_log_context() + + # Set the active context before activating the log store + # so that activate() can access the log model from context + set_active_log_context(self.log_store, self.log_request) + self.log_store.activate() + return self def __exit__( @@ -211,6 +246,8 @@ def __exit__( ) -> None: """Exit the context and deactivate log collection. + Restores the previous active context to support nested contexts. + Args: exc_type: The class of the exception. exc_val: The instance of the exception. 
@@ -224,6 +261,14 @@ def __exit__( self.log_store.deactivate() + if self._previous_log_context: + set_active_log_context( + self._previous_log_context[0], + self._previous_log_context[1], + ) + else: + set_active_log_context(None) + def setup_orchestrator_logging( run_id: UUID, diff --git a/src/zenml/orchestrators/local/local_orchestrator.py b/src/zenml/orchestrators/local/local_orchestrator.py index e45c4c1918d..11e56bd9e17 100644 --- a/src/zenml/orchestrators/local/local_orchestrator.py +++ b/src/zenml/orchestrators/local/local_orchestrator.py @@ -19,6 +19,7 @@ from zenml.enums import ExecutionMode from zenml.logger import get_logger +from zenml.logging.logging import setup_orchestrator_logging from zenml.orchestrators import ( BaseOrchestrator, BaseOrchestratorConfig, @@ -104,94 +105,106 @@ def submit_pipeline( ) self._orchestrator_run_id = str(uuid4()) - start_time = time.time() - - execution_mode = snapshot.pipeline_configuration.execution_mode - - failed_steps: List[str] = [] - step_exception: Optional[Exception] = None - skipped_steps: List[str] = [] - - self.run_init_hook(snapshot=snapshot) - - # Run each step - for step_name, step in snapshot.step_configurations.items(): - if ( - execution_mode == ExecutionMode.STOP_ON_FAILURE - and failed_steps - ): - logger.warning( - "Skipping step %s due to the failed step(s): %s (Execution mode %s)", - step_name, - ", ".join(failed_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if failed_upstream_steps := [ - fs for fs in failed_steps if fs in step.spec.upstream_steps - ]: - logger.warning( - "Skipping step %s due to failure in upstream step(s): %s (Execution mode %s)", - step_name, - ", ".join(failed_upstream_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if skipped_upstream_steps := [ - fs for fs in skipped_steps if fs in step.spec.upstream_steps - ]: - logger.warning( - "Skipping step %s due to the skipped upstream step(s) %s (Execution mode %s)", - step_name, - ", ".join(skipped_upstream_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if self.requires_resources_in_orchestration_environment(step): - logger.warning( - "Specifying step resources is not supported for the local " - "orchestrator, ignoring resource configuration for " - "step %s.", - step_name, - ) - - step_environment = step_environments[step_name] - try: - with temporary_environment(step_environment): - self.run_step(step=step) - except Exception as e: - failed_steps.append(step_name) - logger.exception("Step %s failed.", step_name) - if execution_mode == ExecutionMode.FAIL_FAST: - step_exception = e - break - - self.run_cleanup_hook(snapshot=snapshot) + # Setup orchestrator logging context (if enabled) + logs_context = setup_orchestrator_logging( + run_id=placeholder_run.id + if placeholder_run + else self._orchestrator_run_id, # type: ignore[arg-type] + snapshot=snapshot, + ) - if execution_mode == ExecutionMode.FAIL_FAST and failed_steps: - assert step_exception is not None - raise step_exception + with logs_context: + start_time = time.time() + + execution_mode = snapshot.pipeline_configuration.execution_mode + + failed_steps: List[str] = [] + step_exception: Optional[Exception] = None + skipped_steps: List[str] = [] + + self.run_init_hook(snapshot=snapshot) + + # Run each step + for step_name, step in snapshot.step_configurations.items(): + if ( + execution_mode == ExecutionMode.STOP_ON_FAILURE + and failed_steps + ): + logger.warning( + "Skipping step %s due to the failed 
step(s): %s (Execution mode %s)", + step_name, + ", ".join(failed_steps), + execution_mode, + ) + skipped_steps.append(step_name) + continue + + if failed_upstream_steps := [ + fs for fs in failed_steps if fs in step.spec.upstream_steps + ]: + logger.warning( + "Skipping step %s due to failure in upstream step(s): %s (Execution mode %s)", + step_name, + ", ".join(failed_upstream_steps), + execution_mode, + ) + skipped_steps.append(step_name) + continue + + if skipped_upstream_steps := [ + fs + for fs in skipped_steps + if fs in step.spec.upstream_steps + ]: + logger.warning( + "Skipping step %s due to the skipped upstream step(s) %s (Execution mode %s)", + step_name, + ", ".join(skipped_upstream_steps), + execution_mode, + ) + skipped_steps.append(step_name) + continue + + if self.requires_resources_in_orchestration_environment(step): + logger.warning( + "Specifying step resources is not supported for the local " + "orchestrator, ignoring resource configuration for " + "step %s.", + step_name, + ) + + step_environment = step_environments[step_name] + try: + with temporary_environment(step_environment): + self.run_step(step=step) + except Exception as e: + failed_steps.append(step_name) + logger.exception("Step %s failed.", step_name) + + if execution_mode == ExecutionMode.FAIL_FAST: + step_exception = e + break + + self.run_cleanup_hook(snapshot=snapshot) + + if execution_mode == ExecutionMode.FAIL_FAST and failed_steps: + assert step_exception is not None + raise step_exception + + if failed_steps: + raise RuntimeError( + "Pipeline run has failed due to failure in step(s): " + f"{', '.join(failed_steps)}" + ) - if failed_steps: - raise RuntimeError( - "Pipeline run has failed due to failure in step(s): " - f"{', '.join(failed_steps)}" + run_duration = time.time() - start_time + logger.info( + "Pipeline run has finished in `%s`.", + string_utils.get_human_readable_time(run_duration), ) - - run_duration = time.time() - start_time - logger.info( - "Pipeline run has finished in `%s`.", - string_utils.get_human_readable_time(run_duration), - ) - self._orchestrator_run_id = None - return None + self._orchestrator_run_id = None + return None def get_orchestrator_run_id(self) -> str: """Returns the active orchestrator run id. 
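A note on the stdout/stderr wrapping changed earlier in this patch: it relies on a small JSON-line protocol in which ZenML's own formatter emits records tagged with `"zenml": true`, and `_wrapped_write` parses them back into `logging.LogRecord` objects before handing them to the active log store. Below is a minimal, standard-library-only sketch of that round trip; the key names (`msg`, `level`, `name`, `filename`, `lineno`, `module`) mirror the fields used in the diff, while the logger name and message are made up for illustration.

```python
import json
import logging
from typing import Optional

# A structured line as the JSON formatter in the diff would emit it.
# The "zenml": true marker is what _wrapped_write looks for.
line = json.dumps(
    {
        "zenml": True,
        "msg": "Step `trainer` finished.",
        "level": "INFO",
        "name": "zenml-demo",
        "filename": "step_runner.py",
        "lineno": 42,
        "module": "step_runner",
    }
)


def parse_zenml_line(text: str) -> Optional[logging.LogRecord]:
    """Rebuild a LogRecord from a JSON console line, if it is one of ours."""
    stripped = text.strip()
    if not (stripped.startswith("{") and stripped.endswith("}")):
        return None
    data = json.loads(stripped)
    if data.get("zenml") is not True:
        return None
    level = getattr(logging, data.get("level", "INFO"), logging.INFO)
    return logging.LogRecord(
        name=data.get("name", "unknown"),
        level=level,
        pathname=data.get("filename", ""),
        lineno=data.get("lineno", 0),
        msg=data.get("msg", text),
        args=(),
        exc_info=None,
        func=data.get("module", ""),
    )


record = parse_zenml_line(line)
assert record is not None
print(record.levelname, record.getMessage())  # INFO Step `trainer` finished.
```

In the real code the reconstructed record is handed to `log_store.emit()` rather than printed, but the field mapping is the same.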
diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 132e781ebf5..1e2a871e24f 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -30,7 +30,7 @@ from zenml.environment import get_run_environment_dict from zenml.exceptions import RunInterruptedException, RunStoppedException from zenml.logger import get_logger -from zenml.logging import step_logging +from zenml.logging import logging as step_logging from zenml.models import ( PipelineRunRequest, PipelineRunResponse, diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index be5c75d3e47..013ecfbd266 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -41,7 +41,7 @@ from zenml.exceptions import StepInterfaceError from zenml.hooks.hook_validators import load_and_run_hook from zenml.logger import get_logger -from zenml.logging.step_logging import LoggingContext, redirected +from zenml.logging.logging import LoggingContext, redirected from zenml.materializers.base_materializer import BaseMaterializer from zenml.materializers.in_memory_materializer import InMemoryMaterializer from zenml.models.v2.core.step_run import ( diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index a0d012747a1..37f8243cc5d 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -58,7 +58,7 @@ from zenml.exceptions import EntityExistsError from zenml.hooks.hook_validators import resolve_and_validate_hook from zenml.logger import get_logger -from zenml.logging.step_logging import LoggingContext +from zenml.logging.logging import LoggingContext from zenml.models import ( CodeReferenceRequest, DeploymentResponse, diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index 9b1b02559d6..c0a7b1ec26c 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -32,7 +32,7 @@ from zenml.enums import ExecutionStatus from zenml.log_stores import fetch_logs from zenml.logger import get_logger -from zenml.logging.step_logging import ( +from zenml.logging.logging import ( MAX_ENTRIES_PER_REQUEST, LogEntry, ) diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index b7c2ef2abbc..9d29318bd2f 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -28,7 +28,7 @@ ) from zenml.enums import ExecutionStatus from zenml.log_stores import fetch_logs -from zenml.logging.step_logging import ( +from zenml.logging.logging import ( MAX_ENTRIES_PER_REQUEST, LogEntry, ) diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index 3f7145ff17c..b95df7d9e65 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -164,7 +164,7 @@ SecretsStoreNotConfiguredError, ) from zenml.io import fileio -from zenml.logger import get_console_handler, get_logger, get_logging_level +from zenml.logger import get_logger, get_logging_level, get_zenml_handler from zenml.metadata.metadata_types import get_metadata_type from zenml.models import ( ActionFilter, @@ -1581,7 +1581,7 @@ def migrate_database(self) -> None: else: alembic_logger.setLevel(logging.WARNING) - alembic_logger.addHandler(get_console_handler()) + 
alembic_logger.addHandler(get_zenml_handler()) # We need to account for 3 distinct cases here: # 1. the database is completely empty (not initialized) From 443d92e313a9f5f0816b333a3023bcfd33c1b649 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 9 Nov 2025 22:44:24 +0100 Subject: [PATCH 15/81] removing docs for now --- .../book/component-guide/log-stores/README.md | 90 ----- .../book/component-guide/log-stores/custom.md | 342 ------------------ .../component-guide/log-stores/datadog.md | 104 ------ docs/book/component-guide/log-stores/otel.md | 81 ----- docs/book/component-guide/toc.md | 4 - 5 files changed, 621 deletions(-) delete mode 100644 docs/book/component-guide/log-stores/README.md delete mode 100644 docs/book/component-guide/log-stores/custom.md delete mode 100644 docs/book/component-guide/log-stores/datadog.md delete mode 100644 docs/book/component-guide/log-stores/otel.md diff --git a/docs/book/component-guide/log-stores/README.md b/docs/book/component-guide/log-stores/README.md deleted file mode 100644 index 44c493d333f..00000000000 --- a/docs/book/component-guide/log-stores/README.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -description: Collecting and storing logs from your pipeline runs. -icon: file-lines ---- - -# Log Stores - -The Log Store is a stack component that handles the collection, storage, and retrieval of logs generated during pipeline and step execution. It provides a centralized way to manage logs across different backends. - -ZenML automatically captures logs from your pipeline runs, including stdout, stderr, and any logging output from your steps. The Log Store determines where these logs are stored and how they can be retrieved later for debugging and monitoring. - -{% hint style="info" %} -By default, if no Log Store is configured in your stack, ZenML will automatically use the Artifact Store as a fallback location for storing logs. This ensures backward compatibility and that logs are always captured without requiring additional configuration. -{% endhint %} - -### When to use it - -The Log Store is an optional component in the ZenML stack. While ZenML provides a default fallback mechanism (using the Artifact Store), you may want to configure a dedicated Log Store when you need: - -* **Centralized logging infrastructure**: Send logs to your existing logging platform (e.g., Datadog, Elasticsearch, Splunk) -* **Real-time log streaming**: View logs as they are generated during pipeline execution -* **Advanced log analysis**: Use specialized logging platforms for searching, filtering, and analyzing logs -* **Compliance requirements**: Store logs in specific systems for regulatory or audit purposes - -#### Log Store Flavors - -Out of the box, ZenML provides several Log Store implementations: - -| Log Store | Flavor | Integration | Notes | -| ------------------ | ---------- | ----------- | ----------------------------------------------------------------------------- | -| [OpenTelemetry](otel.md) | `otel` | _built-in_ | Generic OpenTelemetry-based log store that can export to various backends | -| [Datadog](datadog.md) | `datadog` | _built-in_ | Sends logs directly to Datadog's logging platform | -| [Custom Implementation](custom.md) | _custom_ | | Extend the Log Store abstraction and provide your own implementation | - -If you would like to see the available flavors of Log Stores, you can use the command: - -```shell -zenml log-store flavor list -``` - -### How to use it - -The Log Store works automatically once configured in your stack. 
You don't need to make any changes to your pipeline code. All logging output, print statements, and errors are automatically captured and sent to the configured Log Store. - -#### Basic Setup - -To register and configure a Log Store: - -```shell -# Register a log store -zenml log-store register my_datadog_logs --flavor datadog \ - --api_key= \ - --site=datadoghq.com - -# Add it to your stack -zenml stack update -l my_datadog_logs -``` - -Once configured, all subsequent pipeline runs will send their logs to the configured Log Store. - -#### Viewing Logs - -Logs can be viewed through: - -1. **ZenML Dashboard**: View logs directly in the pipeline run UI -2. **CLI**: Use `zenml logs` commands to fetch and display logs -3. **External Platform**: Access logs directly in your logging platform (e.g., Datadog UI) - -#### Log Metadata - -All logs captured by ZenML include important metadata: - -* `pipeline_run_id`: The unique identifier of the pipeline run -* `step_id`: The unique identifier of the step (if applicable) -* `source`: Where the logs originated from (e.g., "step", "orchestrator") - -This metadata allows you to filter and query logs effectively in your logging platform. - -#### Fallback Behavior - -If no Log Store is configured in your stack, ZenML will: - -1. Automatically use the Artifact Store as the storage backend -2. Save logs as files in the artifact store -3. Make logs accessible through the same APIs and UI - -This ensures that logs are always captured and retrievable, even without explicit Log Store configuration. - -
ZenML Scarf
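To make the "no changes to your pipeline code" claim in the removed overview concrete, here is a minimal sketch of a pipeline that benefits from a configured log store. It assumes only the standard `@step` and `@pipeline` decorators; nothing in it refers to the log store, which is resolved from the active stack at run time.

```python
from zenml import pipeline, step


@step
def train() -> float:
    # Plain print and logging calls are captured by whichever log store
    # (or artifact-store fallback) the active stack provides.
    print("starting training")
    return 0.42


@pipeline
def training_pipeline() -> None:
    train()


if __name__ == "__main__":
    training_pipeline()
```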
- diff --git a/docs/book/component-guide/log-stores/custom.md b/docs/book/component-guide/log-stores/custom.md deleted file mode 100644 index 5f4474d7a40..00000000000 --- a/docs/book/component-guide/log-stores/custom.md +++ /dev/null @@ -1,342 +0,0 @@ ---- -description: Developing a custom log store. ---- - -# Develop a Custom Log Store - -If you want to send logs to a backend that isn't covered by the built-in log stores, you can create your own custom log store implementation. - -### Base Abstraction - -The `BaseLogStore` provides three main methods that you need to implement: - -```python -import logging -from zenml.log_stores import BaseLogStore, BaseLogStoreConfig -from zenml.models import LogsRequest - -class MyLogStoreConfig(BaseLogStoreConfig): - """Configuration for my custom log store.""" - - my_setting: str - another_setting: int = 100 - -class MyLogStore(BaseLogStore): - """My custom log store implementation.""" - - @property - def config(self) -> MyLogStoreConfig: - return cast(MyLogStoreConfig, self._config) - - def activate(self, log_request: LogsRequest) -> None: - """Activate log collection. - - This is called at the start of a pipeline run or step. - Set up your logging handlers, connections, and register - with the routing handler. - - Args: - log_request: Contains log ID, URI, and metadata. - """ - from zenml.logging.routing_handler import ( - ensure_routing_handler_installed, - set_active_log_store, - ) - - # Ensure global routing handler is installed - ensure_routing_handler_installed() - - # Initialize your backend connection - self._setup_backend(log_request) - - # Register this log store for current thread - set_active_log_store(self) - - def emit(self, record: logging.LogRecord) -> None: - """Process a log record. - - This is called by the routing handler for each log message. - Send the log to your backend. You can safely use print() - or logger.info() here - reentrancy protection prevents loops. - - Args: - record: The log record to process. - """ - # Send log to your backend - self._send_to_backend(record) - - def deactivate(self) -> None: - """Deactivate log collection and clean up. - - This is called at the end of a pipeline run or step. - Flush any pending logs, close connections, and unregister. - """ - from zenml.logging.routing_handler import set_active_log_store - - # Unregister from routing handler - set_active_log_store(None) - - # Clean up your backend connection - self._cleanup_backend() - - def fetch( - self, - logs_model: LogsResponse, - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - limit: int = 20000, - ) -> List[LogEntry]: - """Fetch logs from the backend. - - This is called by the server to retrieve logs for display. - Query your backend and return logs as LogEntry objects. - - Args: - logs_model: Contains pipeline_run_id, step_id, and metadata. - start_time: Filter logs after this time. - end_time: Filter logs before this time. - limit: Maximum number of logs to return. - - Returns: - List of log entries. - """ - return [] -``` - -### Implementation Patterns - -#### 1. 
Direct Implementation (Simple) - -The simplest pattern is to directly implement the `emit()` method: - -```python -import logging -from zenml.log_stores import BaseLogStore - -class MyLogStore(BaseLogStore): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.backend_client = None - - def activate(self, log_request): - from zenml.logging.routing_handler import ( - ensure_routing_handler_installed, - set_active_log_store, - ) - - # Install routing handler - ensure_routing_handler_installed() - - # Set up backend connection - self.backend_client = MyBackendClient( - url=self.config.backend_url, - log_id=log_request.id, - ) - - # Register for current thread - set_active_log_store(self) - - def emit(self, record): - """Process each log record.""" - # You can safely use print() or logger.info() here! - # Reentrancy protection prevents infinite loops. - - log_data = { - "message": record.getMessage(), - "level": record.levelname, - "timestamp": record.created, - } - - self.backend_client.send_log(log_data) - - def deactivate(self): - from zenml.logging.routing_handler import set_active_log_store - - if self.backend_client: - self.backend_client.close() - - set_active_log_store(None) -``` - -#### 2. Using Internal Handlers (Advanced) - -If you want to use Python's logging.Handler internally: - -```python -class MyLogStore(BaseLogStore): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._handler = None - - def activate(self, log_request): - from zenml.logging.routing_handler import ( - ensure_routing_handler_installed, - set_active_log_store, - ) - - ensure_routing_handler_installed() - - # Create internal handler (not added to root logger) - self._handler = MyCustomHandler( - backend_url=self.config.backend_url, - log_id=log_request.id, - ) - - set_active_log_store(self) - - def emit(self, record): - """Delegate to internal handler.""" - if self._handler: - self._handler.emit(record) - - def deactivate(self): - from zenml.logging.routing_handler import set_active_log_store - - if self._handler: - self._handler.flush() - self._handler.close() - - set_active_log_store(None) -``` - -#### 3. Background Processing - -For efficient log handling, use background threads for batching: - -```python -import queue -import threading - -class MyLogStore(BaseLogStore): - def activate(self, log_request): - from zenml.logging.routing_handler import ( - ensure_routing_handler_installed, - set_active_log_store, - ) - - ensure_routing_handler_installed() - - self.log_queue = queue.Queue(maxsize=2048) - self.shutdown_event = threading.Event() - self.worker_thread = threading.Thread( - target=self._process_logs, - daemon=True - ) - self.worker_thread.start() - - set_active_log_store(self) - - def emit(self, record): - """Queue logs for background processing.""" - try: - self.log_queue.put_nowait(record) - except queue.Full: - pass # Drop logs if queue is full - - def _process_logs(self): - """Background thread processes queued logs.""" - while not self.shutdown_event.is_set(): - try: - record = self.log_queue.get(timeout=1) - self._send_to_backend(record) - except queue.Empty: - continue - - def deactivate(self): - from zenml.logging.routing_handler import set_active_log_store - - self.shutdown_event.set() - if self.worker_thread: - self.worker_thread.join(timeout=5) - - set_active_log_store(None) -``` - -#### 4. 
Fetching Logs - -Implement fetch using HTTP APIs or SDKs: - -```python -import requests -from zenml.logging.step_logging import LogEntry - -class MyLogStore(BaseLogStore): - def fetch( - self, - logs_model, - start_time=None, - end_time=None, - limit=20000, - ): - """Fetch logs from your backend.""" - query = { - "pipeline_run_id": str(logs_model.pipeline_run_id), - } - - if logs_model.step_run_id: - query["step_id"] = str(logs_model.step_run_id) - if start_time: - query["start_time"] = start_time.isoformat() - if end_time: - query["end_time"] = end_time.isoformat() - - response = requests.post( - f"{self.config.backend_url}/query", - json=query, - headers={"Authorization": f"Bearer {self.config.api_key}"} - ) - - logs = [] - for log_data in response.json()["logs"][:limit]: - logs.append(LogEntry( - message=log_data["message"], - level=log_data.get("level"), - timestamp=log_data.get("timestamp"), - )) - return logs -``` - -### Creating a Flavor - -To make your log store usable via CLI, create a flavor: - -```python -from zenml.enums import StackComponentType -from zenml.stack.flavor import Flavor - -class MyLogStoreFlavor(Flavor): - @property - def name(self) -> str: - return "my_custom_store" - - @property - def type(self) -> StackComponentType: - return StackComponentType.LOG_STORE - - @property - def config_class(self) -> Type[BaseLogStoreConfig]: - from my_module import MyLogStoreConfig - return MyLogStoreConfig - - @property - def implementation_class(self) -> Type[BaseLogStore]: - from my_module import MyLogStore - return MyLogStore -``` - -Register your flavor: - -```bash -zenml log-store flavor register my_module.MyLogStoreFlavor -``` - -Then use it: - -```bash -zenml log-store register my_logs --flavor my_custom_store \ - --backend_url=https://logs.example.com \ - --api_key=secret -``` - -
ZenML Scarf
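The background-processing pattern from the removed guide is worth seeing in isolation, since the queue, worker thread, and shutdown handshake are plain Python and easy to get subtly wrong. The sketch below is a generic illustration rather than ZenML code: `send_to_backend` is a hypothetical stand-in for whatever a custom store does with a record.

```python
import logging
import queue
import threading


def send_to_backend(record: logging.LogRecord) -> None:
    # Hypothetical sink; a real store would ship the record somewhere.
    print(f"exported: {record.getMessage()}")


log_queue: "queue.Queue[logging.LogRecord]" = queue.Queue(maxsize=2048)
shutdown_event = threading.Event()


def worker() -> None:
    # Drain the queue until shutdown is requested and nothing is left.
    while not (shutdown_event.is_set() and log_queue.empty()):
        try:
            record = log_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        send_to_backend(record)


thread = threading.Thread(target=worker, daemon=True)
thread.start()

# emit(): enqueue without blocking, dropping logs if the queue is full.
for i in range(3):
    rec = logging.LogRecord("demo", logging.INFO, "", 0, f"message {i}", (), None)
    try:
        log_queue.put_nowait(rec)
    except queue.Full:
        pass

# deactivate(): signal shutdown and give the worker a bounded time to drain.
shutdown_event.set()
thread.join(timeout=5)
```

Checking both the shutdown flag and queue emptiness lets pending records drain before the worker exits, which is the same reason the OpenTelemetry-based stores call `force_flush()` before `shutdown()`.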
- diff --git a/docs/book/component-guide/log-stores/datadog.md b/docs/book/component-guide/log-stores/datadog.md deleted file mode 100644 index 676094addbc..00000000000 --- a/docs/book/component-guide/log-stores/datadog.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -description: Sending logs directly to Datadog. ---- - -# Datadog Log Store - -The Datadog Log Store sends logs directly to Datadog's logging platform using their HTTP intake API. It extends the OpenTelemetry Log Store and adds Datadog-specific formatting and tagging. - -### When to use it - -Use the Datadog Log Store when you: - -* Already use Datadog for monitoring and observability -* Want to correlate pipeline logs with other Datadog metrics and traces -* Need advanced log analysis and visualization features -* Want to set up alerts based on pipeline log patterns - -### How to deploy it - -The Datadog Log Store requires the OpenTelemetry SDK to be installed: - -```bash -pip install opentelemetry-sdk requests -``` - -### How to use it - -First, obtain your Datadog API key from the Datadog UI (Organization Settings → API Keys). - -Register a Datadog log store: - -```bash -zenml log-store register datadog_logs --flavor datadog \ - --api_key= \ - --site=datadoghq.com -``` - -For EU customers, use `datadoghq.eu`: - -```bash -zenml log-store register datadog_logs --flavor datadog \ - --api_key= \ - --site=datadoghq.eu -``` - -Add it to your stack: - -```bash -zenml stack update -l datadog_logs -``` - -#### Configuration Options - -The Datadog Log Store supports all OpenTelemetry Log Store options plus: - -* `api_key`: Your Datadog API key (required) -* `site`: The Datadog site (default: "datadoghq.com") - * US: `datadoghq.com` - * EU: `datadoghq.eu` - * Other regions: check Datadog documentation -* `additional_tags`: Additional tags to add to all logs (optional) - -#### Log Tags - -All logs sent to Datadog include the following tags for easy filtering: - -* `service:`: The service name from your configuration -* `zenml.pipeline_run_id:`: The pipeline run identifier -* `zenml.step_id:`: The step identifier (if applicable) -* `zenml.source:`: The log source ("step" or "orchestrator") -* `deployment.environment:`: The deployment environment - -#### Viewing Logs in Datadog - -Once configured, logs will appear in the Datadog Logs Explorer. You can: - -1. Go to Datadog → Logs → Search -2. Filter by service: `service:zenml-pipelines` -3. Filter by pipeline: `zenml.pipeline_run_id:` -4. Filter by step: `zenml.step_id:` - -#### Example: Production Setup - -```bash -zenml log-store register prod_datadog_logs --flavor datadog \ - --api_key=$DATADOG_API_KEY \ - --site=datadoghq.com \ - --service_name=ml-pipelines \ - --deployment_environment=production \ - --additional_tags='{"team":"ml-platform","project":"recommendation-system"}' -``` - -#### Setting Up Alerts - -In Datadog, you can create log-based alerts: - -1. Go to Datadog → Logs → Configuration → Log Alerts -2. Create a new monitor -3. Set the query to filter your pipeline logs (e.g., `service:zenml-pipelines @zenml.pipeline_run_id:*`) -4. Define alert conditions (e.g., error rate threshold) -5. Configure notifications - -
ZenML Scarf
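For orientation, the kind of request the Datadog flavor ultimately issues can be sketched with plain `requests`. The endpoint path, `DD-API-KEY` header, and payload fields below follow Datadog's public HTTP log intake (v2) as of this writing and should be verified against Datadog's current documentation; the tag values mirror the ones listed in the removed page, and the UUID is a placeholder.

```python
import os

import requests

site = "datadoghq.com"  # or datadoghq.eu, per the configuration above
api_key = os.environ["DATADOG_API_KEY"]

payload = [
    {
        "message": "Pipeline run has finished in `2m 13s`.",
        "service": "zenml-pipelines",
        "ddsource": "zenml",
        # Tags as described above; the run id is a placeholder UUID.
        "ddtags": ",".join(
            [
                "zenml.pipeline_run_id:00000000-0000-0000-0000-000000000000",
                "zenml.source:step",
                "deployment.environment:production",
            ]
        ),
    }
]

response = requests.post(
    f"https://http-intake.logs.{site}/api/v2/logs",
    json=payload,
    headers={"DD-API-KEY": api_key, "Content-Type": "application/json"},
    timeout=10,
)
response.raise_for_status()
```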
- diff --git a/docs/book/component-guide/log-stores/otel.md b/docs/book/component-guide/log-stores/otel.md deleted file mode 100644 index 58055146c54..00000000000 --- a/docs/book/component-guide/log-stores/otel.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -description: Exporting logs using OpenTelemetry. ---- - -# OpenTelemetry Log Store - -The OpenTelemetry Log Store allows you to export logs to any OpenTelemetry-compatible backend. It uses the OpenTelemetry SDK to collect and export logs with structured metadata. - -### When to use it - -Use the OpenTelemetry Log Store when you: - -* Want to send logs to any OpenTelemetry-compatible backend -* Need structured logging with rich metadata -* Want to integrate with your existing OpenTelemetry infrastructure -* Need flexibility to change backends without changing your ZenML configuration - -### How to deploy it - -The OpenTelemetry Log Store requires the OpenTelemetry SDK to be installed: - -```bash -pip install opentelemetry-sdk opentelemetry-exporter-otlp -``` - -### How to use it - -Register an OpenTelemetry log store: - -```bash -zenml log-store register otel_logs --flavor otel \ - --service_name=zenml-pipelines \ - --endpoint=http://otel-collector:4318 -``` - -Add it to your stack: - -```bash -zenml stack update -l otel_logs -``` - -#### Configuration Options - -The OpenTelemetry Log Store supports the following configuration options: - -* `service_name`: The name of your service (default: "zenml") -* `service_version`: The version of your service (default: "1.0.0") -* `deployment_environment`: The deployment environment (default: "production") -* `endpoint`: The OTLP endpoint URL (optional) -* `headers`: Custom headers to send with log exports (optional) -* `insecure`: Whether to use an insecure connection (default: False) -* `max_queue_size`: Maximum queue size for batch processing (default: 2048) -* `schedule_delay_millis`: Export interval in milliseconds (default: 1000) -* `max_export_batch_size`: Maximum batch size for exports (default: 512) - -#### Resource Attributes - -All logs exported through the OpenTelemetry Log Store include the following resource attributes: - -* `service.name`: The configured service name -* `service.version`: The configured service version -* `service.instance.id`: A unique instance identifier -* `deployment.environment`: The deployment environment -* `zenml.pipeline_run_id`: The pipeline run UUID -* `zenml.step_id`: The step UUID (if applicable) -* `zenml.source`: The log source ("step" or "orchestrator") - -These attributes allow you to filter and aggregate logs by pipeline, step, or environment in your observability platform. - -#### Example: Using with an OTLP Collector - -```bash -zenml log-store register my_otel_logs --flavor otel \ - --service_name=my-ml-pipelines \ - --deployment_environment=production \ - --endpoint=https://otlp-collector.example.com:4318 \ - --headers='{"Authorization":"Bearer token123"}' -``` - -
ZenML Scarf
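The OTLP-related options in the removed page (`endpoint`, `headers`, and the batching knobs) map directly onto OpenTelemetry SDK objects. The sketch below wires them up by hand; the HTTP exporter import path is the one shipped with `opentelemetry-exporter-otlp` at the time of writing, and the collector URL and bearer token are placeholders.

```python
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import LoggerProvider
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource

exporter = OTLPLogExporter(
    endpoint="http://otel-collector:4318/v1/logs",  # placeholder collector
    headers={"Authorization": "Bearer token123"},  # placeholder token
)

# The batching knobs correspond to max_queue_size, schedule_delay_millis and
# max_export_batch_size in the configuration options listed above.
processor = BatchLogRecordProcessor(
    exporter,
    max_queue_size=2048,
    schedule_delay_millis=1000,
    max_export_batch_size=512,
)

provider = LoggerProvider(
    resource=Resource.create({"service.name": "zenml-pipelines"})
)
provider.add_log_record_processor(processor)
```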
- diff --git a/docs/book/component-guide/toc.md b/docs/book/component-guide/toc.md index 9db7fee2d0c..53a867ed184 100644 --- a/docs/book/component-guide/toc.md +++ b/docs/book/component-guide/toc.md @@ -32,10 +32,6 @@ * [Google Cloud Storage (GCS)](artifact-stores/gcp.md) * [Azure Blob Storage](artifact-stores/azure.md) * [Develop a custom artifact store](artifact-stores/custom.md) -* [Log Stores](log-stores/README.md) - * [OpenTelemetry](log-stores/otel.md) - * [Datadog](log-stores/datadog.md) - * [Develop a custom log store](log-stores/custom.md) * [Container Registries](container-registries/README.md) * [Default Container Registry](container-registries/default.md) * [DockerHub](container-registries/dockerhub.md) From cc39e3413b84a8c273e474e2497b1097a7112717 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 9 Nov 2025 22:57:01 +0100 Subject: [PATCH 16/81] some fixes --- .../log_stores/datadog/datadog_flavor.py | 3 +- .../default/artifact_store_exporter.py | 9 +----- src/zenml/logging/logging.py | 32 ++++++++++++------- src/zenml/logging/step_logging.py | 6 +--- src/zenml/orchestrators/step_launcher.py | 2 +- src/zenml/pipelines/pipeline_definition.py | 2 +- .../zen_server/routers/steps_endpoints.py | 2 +- .../versions/5c0a1c787128_add_log_stores.py | 4 +-- 8 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 3c68c27e776..eea2f975acf 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -27,8 +27,6 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): """Configuration for Datadog log store. - This extends OtelLogStoreConfig with Datadog-specific settings. - Attributes: api_key: Datadog API key for log ingestion. site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). @@ -73,6 +71,7 @@ def sdk_docs_url(self) -> str: """ return self.docs_url + # TODO: Add logo for the Datadog log store @property def logo_url(self) -> str: """URL to the flavor logo. diff --git a/src/zenml/log_stores/default/artifact_store_exporter.py b/src/zenml/log_stores/default/artifact_store_exporter.py index 5126ae89223..cc3ada297fe 100644 --- a/src/zenml/log_stores/default/artifact_store_exporter.py +++ b/src/zenml/log_stores/default/artifact_store_exporter.py @@ -11,11 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""OpenTelemetry exporter that writes logs to ZenML artifact store. - -This implementation reuses the proven logic from the original step_logging.py -implementation, including message chunking and JSON line formatting. 
-""" +"""OpenTelemetry exporter that writes logs to ZenML artifact store.""" import time from typing import TYPE_CHECKING, List, Sequence @@ -29,8 +25,6 @@ from zenml.artifact_stores import BaseArtifactStore from zenml.enums import LoggingLevels - -# Import from default_log_store to avoid duplication from zenml.log_stores.default.default_log_store import remove_ansi_escape_codes from zenml.logger import get_logger from zenml.logging.logging import DEFAULT_MESSAGE_SIZE, LogEntry @@ -114,7 +108,6 @@ def _otel_record_to_log_entries( level = self._map_severity_to_level(log_record.severity_text) - name = "unknown" module = None filename = None diff --git a/src/zenml/logging/logging.py b/src/zenml/logging/logging.py index 98e78867e84..f124d22e66b 100644 --- a/src/zenml/logging/logging.py +++ b/src/zenml/logging/logging.py @@ -13,18 +13,18 @@ # permissions and limitations under the License. """ZenML logging.""" -from contextlib import nullcontext, contextmanager +from contextlib import contextmanager, nullcontext from contextvars import ContextVar from datetime import datetime from types import TracebackType from typing import ( TYPE_CHECKING, Any, + Generator, Optional, Tuple, Type, Union, - Generator, ) from uuid import UUID, uuid4 @@ -37,8 +37,12 @@ ) from zenml.enums import LoggingLevels from zenml.logger import get_logger -from zenml.models import LogsRequest, PipelineSnapshotResponse, PipelineRunUpdate -from zenml.logging import LoggingContext +from zenml.models import ( + LogsRequest, + LogsResponse, + PipelineRunUpdate, + PipelineSnapshotResponse, +) from zenml.utils.time_utils import utc_now logger = get_logger(__name__) @@ -152,12 +156,17 @@ class LogEntry(BaseModel): class LoggingContext: """Context manager which collects logs using a LogStore.""" - def __init__(self, source: str, log_model: Optional[Union["LogsRequest", "LogsResponse"]] = None) -> None: + def __init__( + self, + source: str, + log_model: Optional[Union["LogsRequest", "LogsResponse"]] = None, + ) -> None: """Initialize the logging context. 
Args: source: An identifier for the source of the logs (e.g., "step", "orchestrator") + log_model: The log model to use for the logging context """ if Client().active_stack.log_store: self.log_store = Client().active_stack.log_store @@ -314,6 +323,7 @@ def setup_orchestrator_logging( ) return nullcontext() + # TODO: Double check this function @contextmanager def setup_pipeline_logging( @@ -351,20 +361,20 @@ def setup_pipeline_logging( logs_model = logs_response logs_context = LoggingContext(source="client", log_model=logs_model) - + if run_id and logs_response is None: try: - run_update = PipelineRunUpdate(add_logs=[logs_context.log_model]) + run_update = PipelineRunUpdate( + add_logs=[logs_context.log_model] + ) client.zen_store.update_run( run_id=run_id, run_update=run_update ) except Exception as e: - logger.error( - f"Failed to add logs to the run {run_id}: {e}" - ) + logger.error(f"Failed to add logs to the run {run_id}: {e}") raise e with logs_context: yield logs_context.log_model else: - yield None \ No newline at end of file + yield None diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py index 97fa94600d9..52c27aeafec 100644 --- a/src/zenml/logging/step_logging.py +++ b/src/zenml/logging/step_logging.py @@ -20,13 +20,12 @@ import re import threading import time -from contextlib import contextmanager, nullcontext +from contextlib import nullcontext from contextvars import ContextVar from datetime import datetime from types import TracebackType from typing import ( Any, - Generator, Iterator, List, Optional, @@ -931,6 +930,3 @@ def setup_orchestrator_logging( f"Failed to setup orchestrator logging for run {run_id}: {e}" ) return nullcontext() - - - diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index a387d24a5c8..8b7a996ff72 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -275,7 +275,7 @@ def launch(self) -> StepRunResponse: if step_logging_enabled: logs_context = step_logging.LoggingContext(source="step") - logs_model = logs_context.log_request + logs_model = logs_context.log_model with logs_context: if run_was_created: diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 4ac20ef897c..99c4531244c 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -1048,7 +1048,7 @@ def _run( if logging_enabled: logs_context = LoggingContext("client") - logs_request = logs_context.log_request + logs_request = logs_context.log_model with logs_context: snapshot = self._create_snapshot(**self._run_args) diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 32f35c526fd..7b500020d05 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -28,9 +28,9 @@ VERSION_1, ) from zenml.enums import ExecutionStatus +from zenml.exceptions import AuthorizationException from zenml.log_stores import fetch_logs from zenml.logging.logging import MAX_ENTRIES_PER_REQUEST, LogEntry -from zenml.exceptions import AuthorizationException from zenml.models import ( Page, StepRunFilter, diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index b83e4052aea..da0ed866375 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ 
b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -1,7 +1,7 @@ """add log stores [5c0a1c787128]. Revision ID: 5c0a1c787128 -Revises: 124b57b8c7b1 +Revises: a5a17015b681 Create Date: 2025-10-24 10:06:54.402219 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "5c0a1c787128" -down_revision = "0.91.0" +down_revision = "a5a17015b681" branch_labels = None depends_on = None From fddf02b26c7bc9307c033d0231b1b0b17e6cf89d Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sun, 9 Nov 2025 23:00:49 +0100 Subject: [PATCH 17/81] fix the local orch --- .../orchestrators/local/local_orchestrator.py | 183 ++++++++---------- 1 file changed, 85 insertions(+), 98 deletions(-) diff --git a/src/zenml/orchestrators/local/local_orchestrator.py b/src/zenml/orchestrators/local/local_orchestrator.py index a15f22ce9b3..ad28737317f 100644 --- a/src/zenml/orchestrators/local/local_orchestrator.py +++ b/src/zenml/orchestrators/local/local_orchestrator.py @@ -19,7 +19,6 @@ from zenml.enums import ExecutionMode from zenml.logger import get_logger -from zenml.logging.logging import setup_orchestrator_logging from zenml.orchestrators import ( BaseOrchestrator, BaseOrchestratorConfig, @@ -98,106 +97,94 @@ def submit_pipeline( RuntimeError: If the pipeline run fails. """ self._orchestrator_run_id = str(uuid4()) + start_time = time.time() - # Setup orchestrator logging context (if enabled) - logs_context = setup_orchestrator_logging( - run_id=placeholder_run.id - if placeholder_run - else self._orchestrator_run_id, # type: ignore[arg-type] - snapshot=snapshot, - ) - - with logs_context: - start_time = time.time() - - execution_mode = snapshot.pipeline_configuration.execution_mode - - failed_steps: List[str] = [] - step_exception: Optional[Exception] = None - skipped_steps: List[str] = [] - - self.run_init_hook(snapshot=snapshot) - - # Run each step - for step_name, step in snapshot.step_configurations.items(): - if ( - execution_mode == ExecutionMode.STOP_ON_FAILURE - and failed_steps - ): - logger.warning( - "Skipping step %s due to the failed step(s): %s (Execution mode %s)", - step_name, - ", ".join(failed_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if failed_upstream_steps := [ - fs for fs in failed_steps if fs in step.spec.upstream_steps - ]: - logger.warning( - "Skipping step %s due to failure in upstream step(s): %s (Execution mode %s)", - step_name, - ", ".join(failed_upstream_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if skipped_upstream_steps := [ - fs - for fs in skipped_steps - if fs in step.spec.upstream_steps - ]: - logger.warning( - "Skipping step %s due to the skipped upstream step(s) %s (Execution mode %s)", - step_name, - ", ".join(skipped_upstream_steps), - execution_mode, - ) - skipped_steps.append(step_name) - continue - - if self.requires_resources_in_orchestration_environment(step): - logger.warning( - "Specifying step resources is not supported for the local " - "orchestrator, ignoring resource configuration for " - "step %s.", - step_name, - ) - - step_environment = step_environments[step_name] - try: - with temporary_environment(step_environment): - self.run_step(step=step) - except Exception as e: - failed_steps.append(step_name) - logger.exception("Step %s failed.", step_name) - - if execution_mode == ExecutionMode.FAIL_FAST: - step_exception = e - break - - self.run_cleanup_hook(snapshot=snapshot) - - if execution_mode == ExecutionMode.FAIL_FAST and failed_steps: - assert 
step_exception is not None - raise step_exception - - if failed_steps: - raise RuntimeError( - "Pipeline run has failed due to failure in step(s): " - f"{', '.join(failed_steps)}" + execution_mode = snapshot.pipeline_configuration.execution_mode + + failed_steps: List[str] = [] + step_exception: Optional[Exception] = None + skipped_steps: List[str] = [] + + self.run_init_hook(snapshot=snapshot) + + # Run each step + for step_name, step in snapshot.step_configurations.items(): + if ( + execution_mode == ExecutionMode.STOP_ON_FAILURE + and failed_steps + ): + logger.warning( + "Skipping step %s due to the failed step(s): %s (Execution mode %s)", + step_name, + ", ".join(failed_steps), + execution_mode, + ) + skipped_steps.append(step_name) + continue + + if failed_upstream_steps := [ + fs for fs in failed_steps if fs in step.spec.upstream_steps + ]: + logger.warning( + "Skipping step %s due to failure in upstream step(s): %s (Execution mode %s)", + step_name, + ", ".join(failed_upstream_steps), + execution_mode, + ) + skipped_steps.append(step_name) + continue + + if skipped_upstream_steps := [ + fs for fs in skipped_steps if fs in step.spec.upstream_steps + ]: + logger.warning( + "Skipping step %s due to the skipped upstream step(s) %s (Execution mode %s)", + step_name, + ", ".join(skipped_upstream_steps), + execution_mode, ) + skipped_steps.append(step_name) + continue + + if self.requires_resources_in_orchestration_environment(step): + logger.warning( + "Specifying step resources is not supported for the local " + "orchestrator, ignoring resource configuration for " + "step %s.", + step_name, + ) + + step_environment = step_environments[step_name] + try: + with temporary_environment(step_environment): + self.run_step(step=step) + except Exception as e: + failed_steps.append(step_name) + logger.exception("Step %s failed.", step_name) + + if execution_mode == ExecutionMode.FAIL_FAST: + step_exception = e + break + + self.run_cleanup_hook(snapshot=snapshot) - run_duration = time.time() - start_time - logger.info( - "Pipeline run has finished in `%s`.", - string_utils.get_human_readable_time(run_duration), + if execution_mode == ExecutionMode.FAIL_FAST and failed_steps: + assert step_exception is not None + raise step_exception + + if failed_steps: + raise RuntimeError( + "Pipeline run has failed due to failure in step(s): " + f"{', '.join(failed_steps)}" ) - self._orchestrator_run_id = None - return None + + run_duration = time.time() - start_time + logger.info( + "Pipeline run has finished in `%s`.", + string_utils.get_human_readable_time(run_duration), + ) + self._orchestrator_run_id = None + return None def submit_dynamic_pipeline( self, @@ -343,4 +330,4 @@ def implementation_class(self) -> Type[LocalOrchestrator]: Returns: The implementation class for this flavor. """ - return LocalOrchestrator + return LocalOrchestrator \ No newline at end of file From 9f50c734ff01ee9617cf63b78612400d22637686 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 11 Nov 2025 10:19:10 +0100 Subject: [PATCH 18/81] removed unused --- src/zenml/log_stores/otel/otel_provider.py | 297 --------------------- 1 file changed, 297 deletions(-) delete mode 100644 src/zenml/log_stores/otel/otel_provider.py diff --git a/src/zenml/log_stores/otel/otel_provider.py b/src/zenml/log_stores/otel/otel_provider.py deleted file mode 100644 index 7550a821403..00000000000 --- a/src/zenml/log_stores/otel/otel_provider.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) ZenML GmbH 2025. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. -"""OpenTelemetry logging infrastructure for ZenML.""" - -import atexit -import concurrent.futures -import threading -import time -from typing import TYPE_CHECKING, Dict, Optional, Sequence - -from opentelemetry.sdk._logs.export import ( - BatchLogRecordProcessor, - LogExporter, - LogExportResult, -) - -if TYPE_CHECKING: - from opentelemetry.sdk._logs import LogData - -from zenml.logger import get_logger - -logger = get_logger(__name__) - -# Global shared infrastructure (singleton per process) -_shared_processor: Optional[BatchLogRecordProcessor] = None -_routing_exporter: Optional["RoutingLogExporter"] = None -_infrastructure_lock = threading.Lock() - - -class RoutingLogExporter(LogExporter): - """Routes logs to different exporters based on log_id. - - Processes exports in parallel using a thread pool for better performance - when multiple log stores are active. - """ - - def __init__(self, max_concurrent_exporters: int = 10): - """Initialize the routing exporter with thread pool. - - Args: - max_concurrent_exporters: Maximum number of exporters to run in parallel. - """ - self._exporters: Dict[str, LogExporter] = {} - self._lock = threading.Lock() - self._executor = concurrent.futures.ThreadPoolExecutor( - max_workers=max_concurrent_exporters, - thread_name_prefix="zenml-log-export", - ) - self._export_count = 0 - self._slow_export_count = 0 - - def register_exporter(self, log_id: str, exporter: LogExporter) -> None: - """Register an exporter for a specific log_id. - - Args: - log_id: Unique identifier for the log store. - exporter: The exporter to handle logs for this log_id. - """ - with self._lock: - self._exporters[log_id] = exporter - - def unregister_exporter(self, log_id: str) -> None: - """Unregister an exporter for a specific log_id. - - Also calls shutdown() on the exporter to cleanup resources - and prevent memory leaks. - - Args: - log_id: The log_id to unregister. - """ - with self._lock: - exporter = self._exporters.pop(log_id, None) - if exporter: - try: - exporter.shutdown() - except Exception as e: - logger.warning( - f"Error shutting down exporter for {log_id}: {e}" - ) - - def export(self, batch: Sequence["LogData"]) -> LogExportResult: - """Route logs to appropriate exporters based on log_id. - - Logs are grouped by log_id from the Resource attributes, then - exported in parallel using the thread pool. - - Args: - batch: Sequence of LogData to export. - - Returns: - LogExportResult indicating success or failure. 
- """ - if not batch: - return LogExportResult.SUCCESS - - self._export_count += 1 - start_time = time.time() - - # Group logs by log_id - logs_by_id: Dict[str, list] = {} - - for log_data in batch: - # Extract log_id from Resource attributes - log_id = None - if log_data.log_record.resource: - attrs = dict(log_data.log_record.resource.attributes) - log_id = attrs.get("zenml.log_id") - - if log_id: - logs_by_id.setdefault(log_id, []).append(log_data) - else: - logger.debug("Received log without zenml.log_id") - - # Submit all exports to thread pool in parallel - futures = [] - with self._lock: - for log_id, logs in logs_by_id.items(): - exporter = self._exporters.get(log_id) - if exporter: - future = self._executor.submit( - self._safe_export, exporter, logs, log_id - ) - futures.append(future) - - # Wait for all exports to complete - all_success = True - timeout = 30 # seconds total for all exports - - try: - for future in concurrent.futures.as_completed( - futures, timeout=timeout - ): - try: - result = future.result(timeout=1) - if result != LogExportResult.SUCCESS: - all_success = False - except concurrent.futures.TimeoutError: - logger.error("Export timeout waiting for result") - all_success = False - except Exception as e: - logger.error(f"Export failed: {e}") - all_success = False - except concurrent.futures.TimeoutError: - logger.error(f"Exports took longer than {timeout}s timeout") - all_success = False - - # Monitor performance - duration = time.time() - start_time - if duration > 1.5: # Slower than batch interval - self._slow_export_count += 1 - if self._slow_export_count % 10 == 0: - logger.warning( - f"Slow exports detected: {duration:.2f}s " - f"(total slow: {self._slow_export_count}/{self._export_count})" - ) - - return ( - LogExportResult.SUCCESS if all_success else LogExportResult.FAILURE - ) - - def _safe_export( - self, exporter: LogExporter, logs: Sequence["LogData"], log_id: str - ) -> LogExportResult: - """Safely export logs with error handling. - - Args: - exporter: The exporter to use. - logs: Logs to export. - log_id: ID for logging purposes. - - Returns: - Export result. - """ - try: - return exporter.export(logs) - except Exception as e: - logger.error(f"Export failed for log_id {log_id}: {e}") - return LogExportResult.FAILURE - - def shutdown(self) -> None: - """Shutdown the routing exporter and thread pool.""" - self._executor.shutdown(wait=True) - - def force_flush(self, timeout_millis: int = 30000) -> bool: - """Force flush any buffered logs. - - Args: - timeout_millis: Timeout in milliseconds. - - Returns: - True if successful. - """ - futures = [] - with self._lock: - for exporter in self._exporters.values(): - future = self._executor.submit( - self._safe_flush, exporter, timeout_millis - ) - futures.append(future) - - all_success = True - timeout_sec = timeout_millis / 1000.0 - try: - for future in concurrent.futures.as_completed( - futures, timeout=timeout_sec - ): - try: - if not future.result(timeout=1): - all_success = False - except Exception as e: - logger.warning(f"Force flush failed: {e}") - all_success = False - except concurrent.futures.TimeoutError: - logger.warning("Force flush timeout") - all_success = False - - return all_success - - def _safe_flush(self, exporter: LogExporter, timeout_millis: int) -> bool: - """Safely flush an exporter with error handling. - - Args: - exporter: The exporter to flush. - timeout_millis: Timeout in milliseconds. - - Returns: - True if successful. 
- """ - try: - return exporter.force_flush(timeout_millis) - except Exception as e: - logger.warning(f"Flush failed: {e}") - return False - - -def get_shared_otel_infrastructure() -> tuple[ - BatchLogRecordProcessor, RoutingLogExporter -]: - """Get or create shared OpenTelemetry logging infrastructure. - - Creates a single BatchLogRecordProcessor with RoutingLogExporter that - all log stores share. Each log store creates its own LoggerProvider - with a unique resource. - - Returns: - Tuple of (shared BatchLogRecordProcessor, RoutingLogExporter). - """ - global _shared_processor, _routing_exporter - - if _shared_processor is None: - with _infrastructure_lock: - _routing_exporter = RoutingLogExporter() - _shared_processor = BatchLogRecordProcessor( - _routing_exporter, - max_queue_size=4096, - schedule_delay_millis=1000, - max_export_batch_size=512, - ) - atexit.register(shutdown_shared_infrastructure) - - return _shared_processor, _routing_exporter - - -def shutdown_shared_infrastructure() -> None: - """Shutdown the shared OpenTelemetry infrastructure. - - This is called on process exit via atexit. It shuts down the shared - processor (which stops the background thread) and the routing exporter. - """ - global _shared_processor, _routing_exporter - - if _shared_processor: - try: - _shared_processor.force_flush() - _shared_processor.shutdown() - except Exception as e: - logger.warning(f"Error during processor shutdown: {e}") - - if _routing_exporter: - try: - _routing_exporter.shutdown() - except Exception as e: - logger.warning(f"Error shutting down routing exporter: {e}") - - _shared_processor = None - _routing_exporter = None From 50d6833999971131912d2074e474202bc658e14f Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 11 Nov 2025 10:21:26 +0100 Subject: [PATCH 19/81] delete the old step logging --- src/zenml/logging/step_logging.py | 932 ------------------------------ 1 file changed, 932 deletions(-) delete mode 100644 src/zenml/logging/step_logging.py diff --git a/src/zenml/logging/step_logging.py b/src/zenml/logging/step_logging.py deleted file mode 100644 index 52c27aeafec..00000000000 --- a/src/zenml/logging/step_logging.py +++ /dev/null @@ -1,932 +0,0 @@ -# Copyright (c) ZenML GmbH 2023. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. 
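For context on what [PATCH 18/81] removes above: otel_provider.py kept a single process-wide BatchLogRecordProcessor feeding a RoutingLogExporter, and dispatched each batch by the "zenml.log_id" attribute on the emitting provider's Resource. The sketch below shows how a log store could plausibly have wired into that shared infrastructure; only get_shared_otel_infrastructure, register_exporter and the "zenml.log_id" attribute come from the deleted module, while the ConsoleLogExporter stand-in and the stdlib LoggingHandler bridge are illustrative assumptions.

import logging
from uuid import uuid4

from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import ConsoleLogExporter
from opentelemetry.sdk.resources import Resource

from zenml.log_stores.otel.otel_provider import get_shared_otel_infrastructure

# Process-wide singletons created lazily by the deleted module.
processor, router = get_shared_otel_infrastructure()

# Each log store registers its concrete exporter under a unique log_id
# (ConsoleLogExporter stands in for e.g. a Datadog or artifact store exporter).
log_id = str(uuid4())
router.register_exporter(log_id, ConsoleLogExporter())

# ...and creates its own LoggerProvider whose Resource carries that log_id,
# which RoutingLogExporter.export() reads in order to route the batch.
provider = LoggerProvider(resource=Resource.create({"zenml.log_id": log_id}))
provider.add_log_record_processor(processor)

# Bridge stdlib logging records into the provider.
logging.getLogger("my_pipeline").addHandler(
    LoggingHandler(level=logging.INFO, logger_provider=provider)
)

The replacement design visible later in this series drops that routing layer entirely: the ArtifactLogExporter introduced in [PATCH 21/81] resolves the owning LoggingContext straight from the OpenTelemetry context (LOGGING_CONTEXT_KEY) instead of a Resource attribute.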
-"""ZenML logging handler.""" - -import asyncio -import logging -import os -import queue -import re -import threading -import time -from contextlib import nullcontext -from contextvars import ContextVar -from datetime import datetime -from types import TracebackType -from typing import ( - Any, - Iterator, - List, - Optional, - Type, - Union, -) -from uuid import UUID, uuid4 - -from pydantic import BaseModel, Field - -from zenml.artifact_stores import BaseArtifactStore -from zenml.artifacts.utils import _load_artifact_store -from zenml.client import Client -from zenml.constants import ( - ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, - ENV_ZENML_DISABLE_STEP_NAMES_IN_LOGS, - LOGS_MERGE_INTERVAL_SECONDS, - LOGS_STORAGE_MAX_QUEUE_SIZE, - LOGS_STORAGE_QUEUE_TIMEOUT, - LOGS_WRITE_INTERVAL_SECONDS, - handle_bool_env_var, -) -from zenml.enums import LoggingLevels -from zenml.exceptions import DoesNotExistException -from zenml.logger import ( - get_logger, - get_storage_log_level, - logging_handlers, - step_names_in_console, -) -from zenml.models import ( - LogsRequest, - LogsResponse, - PipelineRunUpdate, - PipelineSnapshotResponse, -) -from zenml.utils.io_utils import sanitize_remote_path -from zenml.utils.time_utils import utc_now -from zenml.zen_stores.base_zen_store import BaseZenStore - -logger = get_logger(__name__) - -# Context variables -redirected: ContextVar[bool] = ContextVar("redirected", default=False) - -ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") - -LOGS_EXTENSION = ".log" -PIPELINE_RUN_LOGS_FOLDER = "pipeline_runs" - -# Maximum number of log entries to return in a single request -MAX_ENTRIES_PER_REQUEST = 20000 -# Maximum size of a single log message in bytes (5KB) -DEFAULT_MESSAGE_SIZE = 5 * 1024 - - -class LogEntry(BaseModel): - """A structured log entry with parsed information.""" - - message: str = Field(description="The log message content") - name: Optional[str] = Field( - default=None, - description="The name of the logger", - ) - level: Optional[LoggingLevels] = Field( - default=None, - description="The log level", - ) - timestamp: Optional[datetime] = Field( - default=None, - description="When the log was created", - ) - module: Optional[str] = Field( - default=None, description="The module that generated this log entry" - ) - filename: Optional[str] = Field( - default=None, - description="The name of the file that generated this log entry", - ) - lineno: Optional[int] = Field( - default=None, description="The fileno that generated this log entry" - ) - chunk_index: int = Field( - default=0, - description="The index of the chunk in the log entry", - ) - total_chunks: int = Field( - default=1, - description="The total number of chunks in the log entry", - ) - id: UUID = Field( - default_factory=uuid4, - description="The unique identifier of the log entry", - ) - - -class ArtifactStoreHandler(logging.Handler): - """Handler that writes log messages to artifact store storage.""" - - def __init__(self, storage: "PipelineLogsStorage"): - """Initialize the handler with a storage instance. - - Args: - storage: The PipelineLogsStorage instance to write to. - """ - super().__init__() - self.storage = storage - - # Get storage log level from environment - self.setLevel(get_storage_log_level().value) - - def emit(self, record: logging.LogRecord) -> None: - """Emit a log record to the storage. - - Args: - record: The log record to emit. 
- """ - try: - # Get level enum - level = LoggingLevels.__members__.get(record.levelname.upper()) - - # Get the message - message = self.format(record) - message = remove_ansi_escape_codes(message).rstrip() - - # Check if message needs to be chunked - message_bytes = message.encode("utf-8") - if len(message_bytes) <= DEFAULT_MESSAGE_SIZE: - # Message is small enough, emit as-is - log_record = LogEntry.model_construct( - message=message, - name=record.name, - level=level, - timestamp=utc_now(tz_aware=True), - module=record.module, - filename=record.filename, - lineno=record.lineno, - ) - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - else: - # Message is too large, split into chunks and emit each one - chunks = self._split_to_chunks(message) - entry_id = uuid4() - for i, chunk in enumerate(chunks): - log_record = LogEntry.model_construct( - message=chunk, - name=record.name, - level=level, - module=record.module, - filename=record.filename, - lineno=record.lineno, - timestamp=utc_now(tz_aware=True), - chunk_index=i, - total_chunks=len(chunks), - id=entry_id, - ) - - json_line = log_record.model_dump_json(exclude_none=True) - self.storage.write(json_line) - except Exception: - pass - - def _split_to_chunks(self, message: str) -> List[str]: - """Split a large message into chunks. - - Args: - message: The message to split. - - Returns: - A list of message chunks. - """ - # Calculate how many chunks we need - message_bytes = message.encode("utf-8") - - # Split the message into chunks, handling UTF-8 boundaries - chunks = [] - start = 0 - - while start < len(message_bytes): - # Calculate the end position for this chunk - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - - # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue - while end > start: - chunk_bytes = message_bytes[start:end] - try: - chunk_text = chunk_bytes.decode("utf-8") - chunks.append(chunk_text) - break - except UnicodeDecodeError: - # If we can't decode, try a smaller chunk - end -= 1 - else: - # If we can't decode anything, use replacement characters - end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - chunks.append( - message_bytes[start:end].decode("utf-8", errors="replace") - ) - - start = end - - return chunks - - -def remove_ansi_escape_codes(text: str) -> str: - """Auxiliary function to remove ANSI escape codes from a given string. - - Args: - text: the input string - - Returns: - the version of the input string where the escape codes are removed. - """ - return ansi_escape.sub("", text) - - -def parse_log_entry(log_line: str) -> Optional[LogEntry]: - """Parse a single log entry into a LogEntry object. - - Handles two formats: - 1. JSON format: {"timestamp": "...", "level": "...", "message": "...", "location": "..."} - Uses Pydantic's model_validate_json for automatic parsing and validation. - 2. Plain text: Any other text (defaults to INFO level) - - Args: - log_line: A single log line to parse - - Returns: - LogEntry object. For JSON logs, all fields are validated and parsed automatically. - For plain text logs, only message is populated with INFO level default. - Returns None only for empty lines. 
- """ - line = log_line.strip() - if not line: - return None - - if line.startswith("{") and line.endswith("}"): - try: - return LogEntry.model_validate_json(line) - except Exception: - pass - - old_format = re.search( - r"^\[(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+UTC\]", line - ) - - timestamp = None - if old_format: - timestamp = old_format.group(1) + "Z" - line = line.replace(old_format.group(0), "").strip() - - return LogEntry( - message=line, - name=None, - level=LoggingLevels.INFO, - timestamp=timestamp, - ) - - -def prepare_logs_uri( - artifact_store: "BaseArtifactStore", - step_name: Optional[str] = None, - log_key: Optional[str] = None, -) -> str: - """Generates and prepares a URI for the log file or folder for a step. - - Args: - artifact_store: The artifact store on which the artifact will be stored. - step_name: Name of the step. Skipped for global pipeline run logs. - log_key: The unique identification key of the log file. - - Returns: - The URI of the log storage (file or folder). - """ - if log_key is None: - log_key = str(uuid4()) - - subfolder = step_name or PIPELINE_RUN_LOGS_FOLDER - logs_base_uri = os.path.join(artifact_store.path, subfolder, "logs") - - # Create the dir - if not artifact_store.exists(logs_base_uri): - artifact_store.makedirs(logs_base_uri) - - # Delete the file if it already exists - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - logs_uri = os.path.join(logs_base_uri, log_key) - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs directory {logs_uri} already exists! Removing old log directory..." - ) - artifact_store.rmtree(logs_uri) - - artifact_store.makedirs(logs_uri) - else: - logs_uri = os.path.join(logs_base_uri, f"{log_key}{LOGS_EXTENSION}") - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs file {logs_uri} already exists! Removing old log file..." - ) - artifact_store.remove(logs_uri) - - return sanitize_remote_path(logs_uri) - - -def fetch_log_records( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], - logs_uri: str, -) -> List[LogEntry]: - """Fetches log entries. - - Args: - zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. - logs_uri: The URI of the artifact (file or directory). - - Returns: - List of log entries. - """ - log_entries = [] - - for line in _stream_logs_line_by_line( - zen_store, artifact_store_id, logs_uri - ): - if log_entry := parse_log_entry(line): - log_entries.append(log_entry) - - if len(log_entries) >= MAX_ENTRIES_PER_REQUEST: - break - - return log_entries - - -def _stream_logs_line_by_line( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], - logs_uri: str, -) -> Iterator[str]: - """Stream logs line by line without loading the entire file into memory. - - This generator yields log lines one by one, handling both single files - and directories with multiple log files. - - Args: - zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. - logs_uri: The URI of the log file or directory. - - Yields: - Individual log lines as strings. - - Raises: - DoesNotExistException: If the artifact does not exist in the artifact store. 
- """ - artifact_store = _load_artifact_store(artifact_store_id, zen_store) - - try: - if not artifact_store.isdir(logs_uri): - # Single file case - with artifact_store.open(logs_uri, "r") as file: - for line in file: - yield line.rstrip("\n\r") - else: - # Directory case - may contain multiple log files - files = artifact_store.listdir(logs_uri) - if not files: - raise DoesNotExistException( - f"Folder '{logs_uri}' is empty in artifact store " - f"'{artifact_store.name}'." - ) - - # Sort files to read them in order - files.sort() - - for file in files: - file_path = os.path.join(logs_uri, str(file)) - with artifact_store.open(file_path, "r") as f: - for line in f: - yield line.rstrip("\n\r") - finally: - artifact_store.cleanup() - - -class PipelineLogsStorage: - """Helper class which buffers and stores logs to a given URI using a background thread.""" - - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - max_queue_size: int = LOGS_STORAGE_MAX_QUEUE_SIZE, - queue_timeout: int = LOGS_STORAGE_QUEUE_TIMEOUT, - write_interval: int = LOGS_WRITE_INTERVAL_SECONDS, - merge_files_interval: int = LOGS_MERGE_INTERVAL_SECONDS, - ) -> None: - """Initialization. - - Args: - logs_uri: the URI of the log file or folder. - artifact_store: Artifact Store from the current step context - max_queue_size: maximum number of individual messages to queue. - queue_timeout: timeout in seconds for putting items in queue when full. - - Positive value: Wait N seconds, then drop logs if queue still full - - Negative value: Block indefinitely until queue has space (never drop logs) - write_interval: the amount of seconds before the created files - get written to the artifact store. - merge_files_interval: the amount of seconds before the created files - get merged into a single file. - """ - # Parameters - self.logs_uri = logs_uri - self.max_queue_size = max_queue_size - self.queue_timeout = queue_timeout - self.write_interval = write_interval - self.merge_files_interval = merge_files_interval - - # State - self.artifact_store = artifact_store - - # Immutable filesystems state - self.last_merge_time = time.time() - - # Queue and log storage thread for async processing - self.log_queue: queue.Queue[str] = queue.Queue(maxsize=max_queue_size) - self.log_storage_thread: Optional[threading.Thread] = None - self.shutdown_event = threading.Event() - self.merge_event = threading.Event() - - # Start the log storage thread - self._start_log_storage_thread() - - def _start_log_storage_thread(self) -> None: - """Start the log storage thread for processing log queue.""" - if ( - self.log_storage_thread is None - or not self.log_storage_thread.is_alive() - ): - self.log_storage_thread = threading.Thread( - target=self._log_storage_worker, - name="LogsStorage-Worker", - ) - self.log_storage_thread.start() - - def _process_log_queue(self, force_merge: bool = False) -> None: - """Write and merge logs to the artifact store using time-based batching. - - Args: - force_merge: Whether to force merge the logs. 
- """ - try: - messages = [] - - # Get first message (blocking with timeout) - try: - first_message = self.log_queue.get(timeout=1) - messages.append(first_message) - except queue.Empty: - return - - # Get any remaining messages without waiting (drain quickly) - while True: - try: - additional_message = self.log_queue.get_nowait() - messages.append(additional_message) - except queue.Empty: - break - - # Write the messages to the artifact store - if messages: - self.write_buffer(messages) - - # Merge the log files if needed - if ( - self._is_merge_needed - or self.merge_event.is_set() - or force_merge - ): - self.merge_event.clear() - - self.merge_log_files(merge_all_files=force_merge) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - finally: - for _ in messages: - self.log_queue.task_done() - - # Wait for the next write interval or until shutdown is requested - self.shutdown_event.wait(timeout=self.write_interval) - - def _log_storage_worker(self) -> None: - """Log storage thread worker that processes the log queue.""" - # Process the log queue until shutdown is requested - while not self.shutdown_event.is_set(): - self._process_log_queue() - - # Shutdown requested - drain remaining queue items and merge log files - self._process_log_queue(force_merge=True) - - def _shutdown_log_storage_thread(self, timeout: int = 5) -> None: - """Shutdown the log storage thread gracefully. - - Args: - timeout: Maximum time to wait for thread shutdown. - """ - if self.log_storage_thread and self.log_storage_thread.is_alive(): - # Then signal the worker to begin graceful shutdown - self.shutdown_event.set() - - # Wait for thread to finish (it will drain the queue automatically) - self.log_storage_thread.join(timeout=timeout) - - def write(self, text: str) -> None: - """Main write method that sends individual messages directly to queue. - - Args: - text: the incoming string. - """ - # Skip empty lines - if text == "\n": - return - - # If the current thread is the log storage thread, do nothing - # to prevent recursion when the storage thread itself generates logs - if ( - self.log_storage_thread - and threading.current_thread() == self.log_storage_thread - ): - return - - # If the current thread is the fsspec IO thread, do nothing - if self._is_fsspec_io_thread: - return - - try: - # Send individual message directly to queue - if not self.shutdown_event.is_set(): - try: - if self.queue_timeout < 0: - # Negative timeout = block indefinitely until queue has space - # Guarantees no log loss but may hang application - self.log_queue.put(text) - else: - # Positive timeout = wait specified time then drop logs - # Prevents application hanging but may lose logs - self.log_queue.put(text, timeout=self.queue_timeout) - except queue.Full: - # This only happens with positive timeout - # Queue is full - just skip this message to avoid blocking - # Better to drop logs than hang the application - pass - - except Exception: - # Silently ignore errors to prevent recursion - pass - - @property - def _is_merge_needed(self) -> bool: - """Checks whether the log files need to be merged. - - Returns: - whether the log files need to be merged. - """ - return ( - self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM - and time.time() - self.last_merge_time > self.merge_files_interval - ) - - @property - def _is_fsspec_io_thread(self) -> bool: - """Checks if the current thread is the fsspec IO thread. - - Returns: - whether the current thread is the fsspec IO thread. 
- """ - # Most artifact stores are based on fsspec, which converts between - # sync and async operations by using a separate AIO thread. - # It may happen that the fsspec call itself will log something, - # which will trigger this method, which may then use fsspec again, - # causing a "Calling sync() from within a running loop" error, because - # the fsspec library does not expect sync calls being made as a result - # of a logging call made by itself. - # To avoid this, we simply check if we're running in the fsspec AIO - # thread and skip the save if that's the case. - try: - return ( - asyncio.events.get_running_loop() is not None - and threading.current_thread().name == "fsspecIO" - ) - except RuntimeError: - # No running loop - return False - - def _get_timestamped_filename(self, suffix: str = "") -> str: - """Returns a timestamped filename. - - Args: - suffix: optional suffix for the file name - - Returns: - The timestamped filename. - """ - return f"{time.time()}{suffix}{LOGS_EXTENSION}" - - def write_buffer(self, buffer_to_write: List[str]) -> None: - """Write the given buffer to file. This runs in the log storage thread. - - Args: - buffer_to_write: The buffer contents to write to file. - """ - if not buffer_to_write: - return - - try: - # If the artifact store is immutable, write the buffer to a new file - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - _logs_uri = self._get_timestamped_filename() - with self.artifact_store.open( - os.path.join( - self.logs_uri, - _logs_uri, - ), - "w", - ) as file: - for message in buffer_to_write: - file.write(f"{message}\n") - - # If the artifact store is mutable, append the buffer to the existing file - else: - with self.artifact_store.open(self.logs_uri, "a") as file: - for message in buffer_to_write: - file.write(f"{message}\n") - self.artifact_store._remove_previous_file_versions( - self.logs_uri - ) - - except Exception as e: - logger.error("Error in log storage thread: %s", e) - - def merge_log_files(self, merge_all_files: bool = False) -> None: - """Merges all log files into one in the given URI. - - Called on the logging context exit. 
- - Args: - merge_all_files: whether to merge all files or only raw files - """ - from zenml.artifacts.utils import ( - _load_file_from_artifact_store, - ) - - # If the artifact store is immutable, merge the log files - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - merged_file_suffix = "_merged" - files_ = self.artifact_store.listdir(self.logs_uri) - if not merge_all_files: - # already merged files will not be merged again - files_ = [ - f for f in files_ if merged_file_suffix not in str(f) - ] - file_name_ = self._get_timestamped_filename( - suffix=merged_file_suffix - ) - if len(files_) > 1: - files_.sort() - logger.debug("Log files count: %s", len(files_)) - - missing_files = set() - # dump all logs to a local file first - with self.artifact_store.open( - os.path.join(self.logs_uri, file_name_), "w" - ) as merged_file: - for file in files_: - try: - merged_file.write( - str( - _load_file_from_artifact_store( - os.path.join(self.logs_uri, str(file)), - artifact_store=self.artifact_store, - mode="r", - ) - ) - ) - except DoesNotExistException: - missing_files.add(file) - - # clean up left over files - for file in files_: - if file not in missing_files: - self.artifact_store.remove( - os.path.join(self.logs_uri, str(file)) - ) - - # Update the last merge time - self.last_merge_time = time.time() - - def send_merge_event(self) -> None: - """Send a merge event to the log storage thread.""" - self.merge_event.set() - - -class PipelineLogsStorageContext: - """Context manager which collects logs during pipeline run execution.""" - - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - prepend_step_name: bool = True, - ) -> None: - """Initializes and prepares a storage object. - - Args: - logs_uri: the URI of the logs file. - artifact_store: Artifact Store from the current pipeline run context. - prepend_step_name: Whether to prepend the step name to the logs. - """ - # Create the storage object - self.storage = PipelineLogsStorage( - logs_uri=logs_uri, artifact_store=artifact_store - ) - - # Create the handler object - self.artifact_store_handler: ArtifactStoreHandler = ( - ArtifactStoreHandler(self.storage) - ) - - # Additional configuration - self.prepend_step_name = prepend_step_name - self.original_step_names_in_console: Optional[bool] = None - self._original_root_level: Optional[int] = None - - def __enter__(self) -> "PipelineLogsStorageContext": - """Enter condition of the context manager. - - Registers an ArtifactStoreHandler for log storage. 
- - Returns: - self - """ - # Add handler to root logger - root_logger = logging.getLogger() - root_logger.addHandler(self.artifact_store_handler) - - # Set root logger level to minimum of all active handlers - # This ensures records can reach any handler that needs them - self._original_root_level = root_logger.level - handler_levels = [handler.level for handler in root_logger.handlers] - - # Set root logger to the minimum level among all handlers - min_level = min(handler_levels) - if min_level < root_logger.level: - root_logger.setLevel(min_level) - - # Add handler to context variables for print() capture - logging_handlers.add(self.artifact_store_handler) - - # Save the current step names context variable state - self.original_step_names_in_console = step_names_in_console.get() - - # Set the step names context variable - step_names_disabled = handle_bool_env_var( - ENV_ZENML_DISABLE_STEP_NAMES_IN_LOGS, default=False - ) - - if step_names_disabled or not self.prepend_step_name: - # Step names are disabled through the env or they are disabled in the config - step_names_in_console.set(False) - else: - # Otherwise, set it True (default) - step_names_in_console.set(True) - - redirected.set(True) - return self - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - """Exit condition of the context manager. - - Args: - exc_type: The class of the exception - exc_val: The instance of the exception - exc_tb: The traceback of the exception - - Removes the handler from loggers and context variables. - """ - if exc_type is not None: - # Write the exception and its traceback to the logs - self.artifact_store_handler.emit( - logging.LogRecord( - name="exception", - level=logging.ERROR, - pathname="", - lineno=0, - msg="An exception has occurred.", - args=(), - exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, - ) - ) - - # Remove handler from root logger and restore original level - root_logger = logging.getLogger() - - # Check if handler is still in the root logger before removing - if self.artifact_store_handler in root_logger.handlers: - root_logger.removeHandler(self.artifact_store_handler) - - # Restore original root logger level - if self._original_root_level is not None: - root_logger.setLevel(self._original_root_level) - - # Remove handler from context variables - logging_handlers.remove(self.artifact_store_handler) - - # Shutdown thread (it will automatically drain queue and merge files) - try: - self.storage._shutdown_log_storage_thread() - except Exception: - pass - - # Restore the original step names context variable state - if self.original_step_names_in_console is not None: - step_names_in_console.set(self.original_step_names_in_console) - - -def setup_orchestrator_logging( - run_id: UUID, - snapshot: "PipelineSnapshotResponse", - logs_response: Optional[LogsResponse] = None, -) -> Any: - """Set up logging for an orchestrator environment. - - This function can be reused by different orchestrators to set up - consistent logging behavior. - - Args: - run_id: The pipeline run ID. - snapshot: The snapshot of the pipeline run. - logs_response: The logs response to continue from. 
- - Returns: - The logs context (PipelineLogsStorageContext) - """ - try: - logging_enabled = True - - if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): - logging_enabled = False - else: - if ( - snapshot.pipeline_configuration.enable_pipeline_logs - is not None - ): - logging_enabled = ( - snapshot.pipeline_configuration.enable_pipeline_logs - ) - - if not logging_enabled: - return nullcontext() - - # Fetch the active stack - client = Client() - active_stack = client.active_stack - - if logs_response: - logs_uri = logs_response.uri - else: - logs_uri = prepare_logs_uri( - artifact_store=active_stack.artifact_store, - ) - logs_model = LogsRequest( - uri=logs_uri, - source="orchestrator", - artifact_store_id=active_stack.artifact_store.id, - ) - - # Add orchestrator logs to the pipeline run - try: - run_update = PipelineRunUpdate(add_logs=[logs_model]) - client.zen_store.update_run( - run_id=run_id, run_update=run_update - ) - except Exception as e: - logger.error( - f"Failed to add orchestrator logs to the run {run_id}: {e}" - ) - raise e - - return PipelineLogsStorageContext( - logs_uri=logs_uri, - artifact_store=active_stack.artifact_store, - prepend_step_name=False, - ) - except Exception as e: - logger.error( - f"Failed to setup orchestrator logging for run {run_id}: {e}" - ) - return nullcontext() From 4866c89aa4d2692b592bdf5823edec70736d25a8 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 11 Nov 2025 11:00:36 +0100 Subject: [PATCH 20/81] some fixes --- src/zenml/logging/logging.py | 11 ++--- .../orchestrators/local/local_orchestrator.py | 2 +- src/zenml/orchestrators/step_launcher.py | 43 +++---------------- 3 files changed, 11 insertions(+), 45 deletions(-) diff --git a/src/zenml/logging/logging.py b/src/zenml/logging/logging.py index f124d22e66b..3fc87ae082f 100644 --- a/src/zenml/logging/logging.py +++ b/src/zenml/logging/logging.py @@ -193,12 +193,11 @@ def __init__( ) self.source = source - self.log_model = log_model or self.generate_log_request() + self.log_model = log_model self._previous_log_context: Optional[ Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]] ] = None - self._is_outermost_context: bool = False def generate_log_request(self) -> "LogsRequest": """Create a log request model. @@ -206,6 +205,8 @@ def generate_log_request(self) -> "LogsRequest": Returns: The log request model. """ + if self.log_model is not None: + from zenml.log_stores.default.default_log_store import ( DefaultLogStore, prepare_logs_uri, @@ -356,11 +357,7 @@ def setup_pipeline_logging( if logging_enabled: client = Client() - logs_model = None - if logs_response: - logs_model = logs_response - - logs_context = LoggingContext(source="client", log_model=logs_model) + logs_context = LoggingContext(source="client", log_model=logs_response) if run_id and logs_response is None: try: diff --git a/src/zenml/orchestrators/local/local_orchestrator.py b/src/zenml/orchestrators/local/local_orchestrator.py index ad28737317f..361b4dc0b46 100644 --- a/src/zenml/orchestrators/local/local_orchestrator.py +++ b/src/zenml/orchestrators/local/local_orchestrator.py @@ -330,4 +330,4 @@ def implementation_class(self) -> Type[LocalOrchestrator]: Returns: The implementation class for this flavor. 
""" - return LocalOrchestrator \ No newline at end of file + return LocalOrchestrator diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 8b7a996ff72..e28419218af 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -30,7 +30,7 @@ from zenml.environment import get_run_environment_dict from zenml.exceptions import RunInterruptedException, RunStoppedException from zenml.logger import get_logger -from zenml.logging import logging as step_logging +from zenml.logging import logging as zenml_logging from zenml.models import ( PipelineRunRequest, PipelineRunResponse, @@ -41,6 +41,7 @@ from zenml.orchestrators import output_utils, publish_utils, step_run_utils from zenml.orchestrators import utils as orchestrator_utils from zenml.orchestrators.step_runner import StepRunner +from zenml.pipelines.build_utils import log_code_repository_usage from zenml.stack import Stack from zenml.steps import StepHeartBeatTerminationException, StepHeartbeatWorker from zenml.utils import env_utils, exception_utils, string_utils @@ -271,11 +272,10 @@ def launch(self) -> StepRunResponse: ) logs_context = nullcontext() - logs_model = None - + logs_request = None if step_logging_enabled: - logs_context = step_logging.LoggingContext(source="step") - logs_model = logs_context.log_model + logs_context = zenml_logging.LoggingContext(source="step") + logs_request = logs_context.log_model with logs_context: if run_was_created: @@ -301,7 +301,7 @@ def launch(self) -> StepRunResponse: invocation_id=self._invocation_id, dynamic_config=dynamic_config, ) - step_run_request.logs = logs_model + step_run_request.logs = logs_request try: request_factory.populate_request(request=step_run_request) @@ -327,37 +327,6 @@ def launch(self) -> StepRunResponse: logger.info(f"Step `{self._invocation_id}` has started.") try: - # here pass a forced save_to_file callable to be - # used as a dump function to use before starting - # the external jobs in step operators - if isinstance( - logs_context, - step_logging.LoggingContext, - ): - # For LoggingContext using DefaultLogStore, trigger merge - # TODO: investigate - from zenml.log_stores.default.default_log_store import ( - DefaultLogStore, - ) - - if isinstance( - logs_context.log_store, DefaultLogStore - ) and hasattr(logs_context.log_store, "storage"): - force_write_logs = ( - logs_context.log_store.storage.send_merge_event - ) - else: - - def _bypass() -> None: - return None - - force_write_logs = _bypass - else: - - def _bypass() -> None: - return None - - force_write_logs = _bypass self._run_step( pipeline_run=pipeline_run, step_run=step_run, From 01da27eb07814c1458ee6504f978f2bedf4be3a8 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Mon, 17 Nov 2025 21:00:02 +0100 Subject: [PATCH 21/81] running checkpoint --- .../execution/pipeline/dynamic/runner.py | 40 ++- .../kubernetes_orchestrator_entrypoint.py | 36 ++- src/zenml/log_stores/__init__.py | 39 +-- .../{default => artifact}/__init__.py | 3 +- .../artifact_log_exporter.py} | 163 +++++----- .../artifact_log_store.py} | 47 +-- .../artifact_log_store_flavor.py} | 60 ++-- src/zenml/log_stores/base_log_store.py | 46 +-- .../log_stores/datadog/datadog_log_store.py | 2 +- src/zenml/log_stores/otel/otel_log_store.py | 41 ++- src/zenml/log_stores/utils.py | 105 +++---- src/zenml/logger.py | 134 ++++---- src/zenml/logging/__init__.py | 15 + src/zenml/logging/logging.py | 296 +++++------------- src/zenml/orchestrators/step_launcher.py | 126 
++++---- src/zenml/orchestrators/step_runner.py | 2 +- src/zenml/pipelines/pipeline_definition.py | 25 +- src/zenml/stack/flavor_registry.py | 4 +- src/zenml/stack/stack.py | 32 +- .../zen_server/routers/runs_endpoints.py | 2 +- .../zen_server/routers/steps_endpoints.py | 3 +- .../functional/zen_stores/test_zen_store.py | 1 - 22 files changed, 545 insertions(+), 677 deletions(-) rename src/zenml/log_stores/{default => artifact}/__init__.py (93%) rename src/zenml/log_stores/{default/artifact_store_exporter.py => artifact/artifact_log_exporter.py} (62%) rename src/zenml/log_stores/{default/default_log_store.py => artifact/artifact_log_store.py} (87%) rename src/zenml/log_stores/{default/default_log_store_flavor.py => artifact/artifact_log_store_flavor.py} (55%) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 9d20565b3f9..a25106c55f8 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -16,6 +16,7 @@ import contextvars import inspect from concurrent.futures import ThreadPoolExecutor +from contextlib import nullcontext from typing import ( TYPE_CHECKING, Any, @@ -49,7 +50,7 @@ ) from zenml.execution.step.utils import launch_step from zenml.logger import get_logger -from zenml.logging.step_logging import setup_pipeline_logging +from zenml.logging.logging import setup_pipeline_logging from zenml.models import ( ArtifactVersionResponse, PipelineRunResponse, @@ -147,18 +148,37 @@ def pipeline(self) -> "DynamicPipeline": def run_pipeline(self) -> None: """Run the pipeline.""" - with setup_pipeline_logging( - source="orchestrator", - snapshot=self._snapshot, - run_id=self._run.id if self._run else None, - ) as logs_request: - with InMemoryArtifactCache(): - run = self._run or create_placeholder_run( + from zenml.logging.logging import generate_logs_request + + # Generate logs request for orchestrator logging + logs_request = generate_logs_request(source="orchestrator") + + with InMemoryArtifactCache(): + run = self._run or create_placeholder_run( + snapshot=self._snapshot, + orchestrator_run_id=self._orchestrator_run_id, + logs=logs_request, + ) + + # Get logs response from the run and set up logging context + logs_response = run.logs + if not logs_response and run.log_collection: + for log in run.log_collection: + if log.source == "orchestrator": + logs_response = log + break + + logs_context = ( + setup_pipeline_logging( snapshot=self._snapshot, - orchestrator_run_id=self._orchestrator_run_id, - logs=logs_request, + run_id=run.id, + logs_response=logs_response, ) + if logs_response + else nullcontext() + ) + with logs_context: assert ( self._snapshot.pipeline_spec ) # Always exists for new snapshots diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index cff6ba524ad..e312b5831be 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -18,6 +18,7 @@ import socket import threading import time +from contextlib import nullcontext from typing import List, Optional, Tuple, cast from uuid import UUID @@ -243,7 +244,7 @@ def main() -> None: namespace=namespace, job_name=job_name, ) - existing_logs_response = None + logs_response = None if run_id and orchestrator_run_id: logger.info("Continuing existing run 
`%s`.", run_id) @@ -259,23 +260,40 @@ def main() -> None: # Continue logging to the same log file if it exists for log_response in pipeline_run.log_collection or []: if log_response.source == "orchestrator": - existing_logs_response = log_response + logs_response = log_response break else: orchestrator_run_id = orchestrator_pod_name + + # Generate logs request for orchestrator logging + from zenml.logging.logging import generate_logs_request + + logs_request = generate_logs_request(source="orchestrator") + if args.run_id: pipeline_run = client.zen_store.update_run( run_id=args.run_id, run_update=PipelineRunUpdate( - orchestrator_run_id=orchestrator_run_id + orchestrator_run_id=orchestrator_run_id, + add_logs=[logs_request], ), ) else: pipeline_run = create_placeholder_run( snapshot=snapshot, orchestrator_run_id=orchestrator_run_id, + logs=logs_request, ) + # Get logs_response from the created/updated run + if pipeline_run.logs: + logs_response = pipeline_run.logs + elif pipeline_run.log_collection: + for log_response in pipeline_run.log_collection: + if log_response.source == "orchestrator": + logs_response = log_response + break + # Store in the job annotations so we can continue the run if the pod # is restarted kube_utils.update_job( @@ -292,10 +310,14 @@ def main() -> None: for step_name, step in snapshot.step_configurations.items() ] - logs_context = setup_orchestrator_logging( - run_id=pipeline_run.id, - snapshot=snapshot, - logs_response=existing_logs_response, + logs_context = ( + setup_orchestrator_logging( + run_id=pipeline_run.id, + snapshot=snapshot, + logs_response=logs_response, + ) + if logs_response + else nullcontext() ) with logs_context: diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index ed602a92728..72a1493d171 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -11,26 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Log stores allow you to collect and store logs from pipeline runs. - -ZenML log stores provide different backends for storing pipeline and step logs. 
-""" +"""Implements the log stores for ZenML.""" # Base log store from zenml.log_stores.base_log_store import ( - BaseLogStore, + BaseLogStore, BaseLogStoreConfig, BaseLogStoreFlavor, ) -# Default log store -from zenml.log_stores.default.default_log_store import ( - DefaultLogStore, +# Artifact log store +from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, +) +from zenml.log_stores.artifact.artifact_log_store_flavor import ( + ArtifactLogStoreConfig, + ArtifactLogStoreFlavor, ) -from zenml.log_stores.default.default_log_store_flavor import ( - DefaultLogStoreConfig, - DefaultLogStoreFlavor, -) # OpenTelemetry log store from zenml.log_stores.otel.otel_log_store import OtelLogStore @@ -41,28 +38,24 @@ # Datadog log store from zenml.log_stores.datadog.datadog_log_store import ( - DatadogLogStore, + DatadogLogStore, ) -from zenml.log_stores.datadog.datadog_flavor import ( - DatadogLogStoreConfig, - DatadogLogStoreFlavor, +from zenml.log_stores.otel.otel_flavor import ( + OtelLogStoreConfig, + OtelLogStoreFlavor, ) -# Utils -from zenml.log_stores.utils import fetch_logs - __all__ = [ + "ArtifactLogStore", + "ArtifactLogStoreConfig", + "ArtifactLogStoreFlavor", "BaseLogStore", "BaseLogStoreConfig", "BaseLogStoreFlavor", "DatadogLogStore", "DatadogLogStoreConfig", "DatadogLogStoreFlavor", - "DefaultLogStore", - "DefaultLogStoreConfig", - "DefaultLogStoreFlavor", "OtelLogStore", "OtelLogStoreConfig", "OtelLogStoreFlavor", - "fetch_logs", ] diff --git a/src/zenml/log_stores/default/__init__.py b/src/zenml/log_stores/artifact/__init__.py similarity index 93% rename from src/zenml/log_stores/default/__init__.py rename to src/zenml/log_stores/artifact/__init__.py index 2d8058ed404..bcdf6cc92d0 100644 --- a/src/zenml/log_stores/default/__init__.py +++ b/src/zenml/log_stores/artifact/__init__.py @@ -11,4 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Default log store implementation.""" \ No newline at end of file +"""Artifact log store implementation.""" + diff --git a/src/zenml/log_stores/default/artifact_store_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py similarity index 62% rename from src/zenml/log_stores/default/artifact_store_exporter.py rename to src/zenml/log_stores/artifact/artifact_log_exporter.py index cc3ada297fe..11ecbe473b3 100644 --- a/src/zenml/log_stores/default/artifact_store_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -13,56 +13,51 @@ # permissions and limitations under the License. 
"""OpenTelemetry exporter that writes logs to ZenML artifact store.""" +import os import time -from typing import TYPE_CHECKING, List, Sequence -from uuid import uuid4 +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Sequence +from uuid import UUID, uuid4 +from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter, LogExportResult if TYPE_CHECKING: from opentelemetry.sdk._logs import LogData - from zenml.artifact_stores import BaseArtifactStore + from zenml.logging.logging import LoggingContext +from zenml.artifacts.utils import _load_artifact_store +from zenml.client import Client from zenml.enums import LoggingLevels -from zenml.log_stores.default.default_log_store import remove_ansi_escape_codes +from zenml.log_stores.artifact.artifact_log_store import ( + remove_ansi_escape_codes, +) +from zenml.log_stores.base_log_store import DEFAULT_MESSAGE_SIZE +from zenml.log_stores.otel.otel_log_store import LOGGING_CONTEXT_KEY +from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger -from zenml.logging.logging import DEFAULT_MESSAGE_SIZE, LogEntry from zenml.utils.time_utils import utc_now logger = get_logger(__name__) -class ArtifactStoreExporter(LogExporter): +class ArtifactLogExporter(LogExporter): """OpenTelemetry exporter that writes logs to ZenML artifact store. - This exporter adapts OpenTelemetry log records to the ZenML LogEntry format - and writes them as JSON lines to the artifact store. + Groups logs by context and writes them to the appropriate artifact store + location based on the filesystem type. """ - def __init__( - self, - logs_uri: str, - artifact_store: "BaseArtifactStore", - ): - """Initialize the artifact store exporter. - - Args: - logs_uri: URI where logs should be written. - artifact_store: The artifact store to write to. - """ - self.logs_uri = logs_uri - self.artifact_store = artifact_store - self.file_counter = 0 + def __init__(self) -> None: + """Initialize the exporter with file counters per context.""" + self.file_counters: Dict[UUID, int] = {} def export(self, batch: Sequence["LogData"]) -> LogExportResult: """Export a batch of logs to the artifact store. - Converts OTEL log records to ZenML LogEntry format with proper - message chunking and writes them as JSON lines. - Args: - batch: Sequence of LogData to export. + batch: Sequence of LogData to export (can be from multiple contexts). Returns: LogExportResult indicating success or failure. 
@@ -71,17 +66,31 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.SUCCESS try: - log_lines = [] + logs_by_context: Dict[UUID, List[str]] = defaultdict(list) + context_metadata: Dict[UUID, "LoggingContext"] = {} + for log_data in batch: - log_record = log_data.log_record + if not log_data.log_record.context: + continue + + context = otel_context.get_value( + LOGGING_CONTEXT_KEY, log_data.log_record.context + ) + if not context: + continue - entries = self._otel_record_to_log_entries(log_record) + log_id = context.log_model.id + context_metadata[log_id] = context + + entries = self._otel_record_to_log_entries(log_data.log_record) for entry in entries: json_line = entry.model_dump_json(exclude_none=True) - log_lines.append(json_line) + logs_by_context[log_id].append(json_line) - if log_lines: - self._write_to_artifact_store(log_lines) + for log_id, log_lines in logs_by_context.items(): + if log_lines: + context = context_metadata[log_id] + self._write_to_artifact_store(log_lines, context, log_id) return LogExportResult.SUCCESS @@ -92,10 +101,7 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: def _otel_record_to_log_entries( self, log_record: "LogData" ) -> List[LogEntry]: - """Convert an OTEL log record to one or more ZenML LogEntry objects. - - Handles message chunking for large messages and extracts all relevant - metadata from the OTEL record. + """Convert an OTEL log record to ZenML LogEntry objects. Args: log_record: The OpenTelemetry log record. @@ -183,10 +189,7 @@ def _map_severity_to_level(self, severity_text: str) -> LoggingLevels: return LoggingLevels.INFO def _split_to_chunks(self, message: str) -> List[str]: - """Split a large message into chunks. - - Properly handles UTF-8 boundaries to avoid breaking multi-byte characters. - This is the same logic from the original step_logging.py implementation. + """Split a large message into chunks, handling UTF-8 boundaries. Args: message: The message to split. @@ -199,10 +202,8 @@ def _split_to_chunks(self, message: str) -> List[str]: start = 0 while start < len(message_bytes): - # Calculate the end position for this chunk end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) - # Try to decode the chunk, backing up if we hit a UTF-8 boundary issue while end > start: chunk_bytes = message_bytes[start:end] try: @@ -210,10 +211,8 @@ def _split_to_chunks(self, message: str) -> List[str]: chunks.append(chunk_text) break except UnicodeDecodeError: - # If we can't decode, try a smaller chunk end -= 1 else: - # If we can't decode anything, use replacement characters end = min(start + DEFAULT_MESSAGE_SIZE, len(message_bytes)) chunks.append( message_bytes[start:end].decode("utf-8", errors="replace") @@ -223,52 +222,59 @@ def _split_to_chunks(self, message: str) -> List[str]: return chunks - def _write_to_artifact_store(self, log_lines: List[str]) -> None: + def _write_to_artifact_store( + self, + log_lines: List[str], + context: "LoggingContext", + log_id: UUID, + ) -> None: """Write log lines to the artifact store. - Generates a unique timestamped filename for each batch and writes - the log lines as newline-delimited JSON. - Args: log_lines: List of JSON-serialized log entries. + context: The LoggingContext containing log_model metadata. + log_id: The log ID for tracking file counters. 
""" - # Generate unique filename with timestamp and counter - # This matches the pattern from the original implementation - timestamp = int(time.time() * 1000) - self.file_counter += 1 + log_model = context.log_model + if not log_model.uri or not log_model.artifact_store_id: + logger.warning( + f"Skipping log write: missing uri or artifact_store_id for log {log_id}" + ) + return + + client = Client() + artifact_store = _load_artifact_store( + log_model.artifact_store_id, client.zen_store + ) - # Use the logs_uri as the base - append timestamp and counter - base_uri = self.logs_uri - if base_uri.endswith(".log"): - base_uri = base_uri[:-4] - - file_uri = f"{base_uri}_{timestamp}_{self.file_counter}.jsonl" + try: + content = "\n".join(log_lines) + "\n" - # Join lines and write (one JSON object per line) - content = "\n".join(log_lines) + "\n" + if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + timestamp = int(time.time() * 1000) + if log_id not in self.file_counters: + self.file_counters[log_id] = 0 + self.file_counters[log_id] += 1 - try: - # Write to artifact store - with self.artifact_store.open(file_uri, "w") as f: - f.write(content) + file_uri = os.path.join( + log_model.uri, + f"{timestamp}_{self.file_counters[log_id]}.jsonl", + ) - logger.debug(f"Wrote {len(log_lines)} log lines to {file_uri}") + with artifact_store.open(file_uri, "w") as f: + f.write(content) + else: + with artifact_store.open(log_model.uri, "a") as f: + f.write(content) except Exception as e: - logger.error(f"Failed to write logs to {file_uri}: {e}") + logger.error(f"Failed to write logs to {log_model.uri}: {e}") raise + finally: + artifact_store.cleanup() def shutdown(self) -> None: - """Shutdown the exporter and cleanup artifact store resources. - - This is important to prevent memory leaks by cleaning up any - cached connections or file handles held by the artifact store. - """ - if hasattr(self, "artifact_store") and self.artifact_store: - try: - self.artifact_store.cleanup() - logger.debug("Artifact store cleanup completed") - except Exception as e: - logger.warning(f"Error during artifact store cleanup: {e}") + """Shutdown the exporter.""" + pass def force_flush(self, timeout_millis: int = 30000) -> bool: """Force flush any buffered logs. @@ -277,7 +283,6 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: timeout_millis: Timeout in milliseconds. Returns: - True if successful (always true - no buffering at this level). + True (no buffering at this level). """ - # No-op - OTEL BatchLogRecordProcessor handles all flushing return True diff --git a/src/zenml/log_stores/default/default_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py similarity index 87% rename from src/zenml/log_stores/default/default_log_store.py rename to src/zenml/log_stores/artifact/artifact_log_store.py index 832a2645b98..30417211be4 100644 --- a/src/zenml/log_stores/default/default_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. 
-"""Default log store implementation.""" +"""Artifact log store implementation.""" import os import re @@ -31,15 +31,13 @@ from zenml.client import Client from zenml.enums import LoggingLevels from zenml.exceptions import DoesNotExistException -from zenml.log_stores.default.default_log_store_flavor import ( - DefaultLogStoreConfig, +from zenml.log_stores.artifact.artifact_log_store_flavor import ( + ArtifactLogStoreConfig, ) +from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.otel.otel_log_store import OtelLogStore +from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger -from zenml.logging.logging import ( - MAX_ENTRIES_PER_REQUEST, - LogEntry, -) from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path from zenml.zen_stores.base_zen_store import BaseZenStore @@ -229,45 +227,34 @@ def parse_log_entry(log_line: str) -> Optional[LogEntry]: ) -class DefaultLogStore(OtelLogStore): +class ArtifactLogStore(OtelLogStore): """Log store that saves logs to the artifact store. - This implementation extends OtelLogStore and uses the ArtifactStoreExporter + This implementation extends OtelLogStore and uses the ArtifactLogExporter to write logs to the artifact store. Inherits all OTEL infrastructure including shared BatchLogRecordProcessor and routing. """ @property - def config(self) -> DefaultLogStoreConfig: - """Returns the configuration of the default log store. + def config(self) -> ArtifactLogStoreConfig: + """Returns the configuration of the artifact log store. Returns: The configuration. """ - return cast(DefaultLogStoreConfig, self._config) + return cast(ArtifactLogStoreConfig, self._config) def get_exporter(self) -> "LogExporter": - """Get the artifact store exporter for this log store. + """Get the artifact log exporter for this log store. Returns: - The ArtifactStoreExporter instance. + The ArtifactLogExporter instance. """ - from zenml.log_stores.default.artifact_store_exporter import ( - ArtifactStoreExporter, + from zenml.log_stores.artifact.artifact_log_exporter import ( + ArtifactLogExporter, ) - from zenml.logging.logging import get_active_log_model - - log_model = get_active_log_model() - if not log_model: - raise RuntimeError( - "get_exporter() called outside of an active logging context. " - "This should not happen." - ) - return ArtifactStoreExporter( - logs_uri=log_model.uri, - artifact_store=Client().active_stack.artifact_store, - ) + return ArtifactLogExporter() def fetch( self, @@ -294,12 +281,12 @@ def fetch( """ if not logs_model.uri: raise ValueError( - "logs_model.uri is required for DefaultLogStore.fetch()" + "logs_model.uri is required for ArtifactLogStore.fetch()" ) if not logs_model.artifact_store_id: raise ValueError( - "logs_model.artifact_store_id is required for DefaultLogStore.fetch()" + "logs_model.artifact_store_id is required for ArtifactLogStore.fetch()" ) client = Client() diff --git a/src/zenml/log_stores/default/default_log_store_flavor.py b/src/zenml/log_stores/artifact/artifact_log_store_flavor.py similarity index 55% rename from src/zenml/log_stores/default/default_log_store_flavor.py rename to src/zenml/log_stores/artifact/artifact_log_store_flavor.py index 40f0a49676c..062b426378d 100644 --- a/src/zenml/log_stores/default/default_log_store_flavor.py +++ b/src/zenml/log_stores/artifact/artifact_log_store_flavor.py @@ -11,30 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
See the License for the specific language governing # permissions and limitations under the License. -"""Default log store flavor implementation.""" +"""Artifact log store flavor implementation.""" from typing import TYPE_CHECKING, Type -from zenml.enums import StackComponentType -from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig -from zenml.stack.flavor import Flavor +from zenml.log_stores.otel.otel_flavor import ( + OtelLogStoreConfig, + OtelLogStoreFlavor, +) if TYPE_CHECKING: from zenml.log_stores.base_log_store import BaseLogStore -class DefaultLogStoreConfig(OtelLogStoreConfig): - """Configuration for the default log store. +class ArtifactLogStoreConfig(OtelLogStoreConfig): + """Configuration for the artifact log store. - This log store saves logs to the artifact store using OTEL infrastructure, - which is the default and backward-compatible approach. - - Inherits OTEL configuration like service_name, batch sizes, etc. + This log store saves logs to the artifact store using OTEL infrastructure. """ -class DefaultLogStoreFlavor(Flavor): - """Default log store flavor implementation.""" +class ArtifactLogStoreFlavor(OtelLogStoreFlavor): + """Artifact log store flavor implementation.""" @property def name(self) -> str: @@ -43,7 +41,7 @@ def name(self) -> str: Returns: The name of the flavor. """ - return "default" + return "artifact" @property def docs_url(self) -> str: @@ -52,16 +50,7 @@ def docs_url(self) -> str: Returns: The URL to the flavor documentation. """ - return "https://docs.zenml.io/stack-components/log-stores/default" - - @property - def sdk_docs_url(self) -> str: - """URL to the SDK docs for this flavor. - - Returns: - The URL to the SDK docs for this flavor. - """ - return self.docs_url + return "https://docs.zenml.io/stack-components/log-stores/artifact" @property def logo_url(self) -> str: @@ -70,26 +59,17 @@ def logo_url(self) -> str: Returns: The URL to the flavor logo. """ - # TODO: Add a logo for the default log store - return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/default.png" - - @property - def type(self) -> StackComponentType: - """Stack component type. - - Returns: - The stack component type. - """ - return StackComponentType.LOG_STORE + # TODO: Add a logo for the artifact log store + return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/artifact.png" @property - def config_class(self) -> Type[DefaultLogStoreConfig]: - """Returns `DefaultLogStoreConfig` config class. + def config_class(self) -> Type[ArtifactLogStoreConfig]: + """Returns `ArtifactLogStoreConfig` config class. Returns: The config class. """ - return DefaultLogStoreConfig + return ArtifactLogStoreConfig @property def implementation_class(self) -> Type["BaseLogStore"]: @@ -98,6 +78,8 @@ def implementation_class(self) -> Type["BaseLogStore"]: Returns: The implementation class. 
""" - from zenml.log_stores.default.default_log_store import DefaultLogStore + from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, + ) - return DefaultLogStore + return ArtifactLogStore diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 74bf7df001a..5bd2bd7de53 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -19,16 +19,18 @@ from typing import TYPE_CHECKING, List, Optional, Type, cast from zenml.enums import StackComponentType -from zenml.logging.logging import ( - DEFAULT_MESSAGE_SIZE, - MAX_ENTRIES_PER_REQUEST, - LogEntry, -) from zenml.stack import Flavor, StackComponent, StackComponentConfig if TYPE_CHECKING: + from zenml.log_stores.utils import LogEntry + from zenml.logging.logging import LoggingContext from zenml.models import LogsResponse +# Maximum number of log entries to return in a single request +MAX_ENTRIES_PER_REQUEST = 20000 +# Maximum size of a single log message in bytes (5KB) +DEFAULT_MESSAGE_SIZE = 5 * 1024 + class BaseLogStoreConfig(StackComponentConfig): """Base configuration for all log stores.""" @@ -52,37 +54,21 @@ def config(self) -> BaseLogStoreConfig: return cast(BaseLogStoreConfig, self._config) @abstractmethod - def activate(self) -> None: - """Activate the log store for log collection. - - This method is called when ZenML needs to start collecting and storing - logs during pipeline or step execution. It should set up any necessary - handlers, threads, or connections. - """ - - @abstractmethod - def deactivate(self) -> None: - """Deactivate the log store and stop log collection. - - This method is called when ZenML needs to stop collecting logs. - It should clean up handlers, flush any pending logs, and shut down - any background threads or connections. - """ - - @abstractmethod - def emit(self, record: logging.LogRecord) -> None: - """Process a log record from the routing handler. + def emit( + self, + record: logging.LogRecord, + context: "LoggingContext", + ) -> None: + """Process a log record from the logging system. This method is called by the ZenML logging system for each log record that should be stored by this log store. Implementations should process the record according to their backend's requirements. - The default implementation does nothing. This allows log stores that - only need to collect logs during pipeline execution (via activate/ - deactivate) without real-time processing to skip implementing this. - Args: record: The Python logging.LogRecord to process. + context: The logging context containing the log_model with routing + metadata (pipeline_run_id, step_run_id, etc.). 
""" @abstractmethod @@ -102,7 +88,7 @@ def fetch( Each log store implementation can extract the information it needs from logs_model: - - DefaultLogStore: uses logs_model.uri and logs_model.artifact_store_id + - ArtifactLogStore: uses logs_model.uri and logs_model.artifact_store_id - OtelLogStore: uses logs_model.pipeline_run_id, step_run_id, source - DatadogLogStore: uses logs_model.pipeline_run_id, step_run_id, source diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 916c659430a..e7481d1811b 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -23,8 +23,8 @@ from zenml.enums import LoggingLevels from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore +from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger -from zenml.logging.logging import LogEntry from zenml.models import LogsResponse logger = get_logger(__name__) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index ca26c5c59b3..d7bcd0b383c 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -14,10 +14,12 @@ """OpenTelemetry log store implementation.""" import logging +import threading from abc import abstractmethod from datetime import datetime from typing import TYPE_CHECKING, Any, List, Optional, cast +from opentelemetry import context as otel_context from opentelemetry._logs.severity import SeverityNumber from opentelemetry.sdk._logs import LoggerProvider from opentelemetry.sdk._logs.export import BatchLogRecordProcessor @@ -32,10 +34,14 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs.export import LogExporter - from zenml.logging.logging import LogEntry + from zenml.log_stores.utils import LogEntry + from zenml.logging.logging import LoggingContext logger = get_logger(__name__) +# Context key for passing LoggingContext through OTel's context system +LOGGING_CONTEXT_KEY = otel_context.create_key("zenml.logging_context") + class OtelLogStore(BaseLogStore): """Log store that exports logs using OpenTelemetry. @@ -45,7 +51,7 @@ class OtelLogStore(BaseLogStore): multiple log stores are active simultaneously. Subclasses should implement `get_exporter()` to provide the specific - log exporter for their backend (e.g., ArtifactStoreExporter, DatadogLogExporter). + log exporter for their backend (e.g., ArtifactLogExporter, DatadogLogExporter). """ def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -61,6 +67,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._exporter: Optional["LogExporter"] = None self._provider: Optional["LoggerProvider"] = None self._processor: Optional["BatchLogRecordProcessor"] = None + self._activation_lock = threading.Lock() @property def config(self) -> OtelLogStoreConfig: @@ -84,15 +91,6 @@ def get_exporter(self) -> "LogExporter": def activate(self) -> None: """Activate log collection with OpenTelemetry.""" - from zenml.logging.logging import get_active_log_model - - log_model = get_active_log_model() - if not log_model: - raise RuntimeError( - "activate() called outside of an active logging context. " - "This should not happen." 
- ) - self._exporter = self.get_exporter() self._processor = BatchLogRecordProcessor(self._exporter) @@ -100,27 +98,37 @@ def activate(self) -> None: { "service.name": self.config.service_name, "service.version": __version__, - "zenml.log_id": str(log_model.id), } ) self._provider = LoggerProvider(resource=self._resource) self._provider.add_log_record_processor(self._processor) - def emit(self, record: logging.LogRecord) -> None: + def emit( + self, + record: logging.LogRecord, + context: "LoggingContext", + ) -> None: """Process a log record by sending to OpenTelemetry. Args: record: The log record to process. + context: The logging context containing the log_model. """ - if not self._provider: - return + with self._activation_lock: + if not self._provider: + self.activate() try: + # Attach the LoggingContext to OTel's context so the exporter + # can access it in the background processor thread + ctx = otel_context.set_value(LOGGING_CONTEXT_KEY, context) + otel_logger = self._provider.get_logger( record.name or "unknown", schema_url=None, ) + otel_logger.emit( timestamp=int(record.created * 1e9), observed_timestamp=int(record.created * 1e9), @@ -131,7 +139,10 @@ def emit(self, record: logging.LogRecord) -> None: "code.filepath": record.pathname, "code.lineno": record.lineno, "code.function": record.funcName, + "log_id": str(context.log_model.id), + "log_store_id": str(self.id), }, + context=ctx, ) except Exception: diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index 37632253281..45c85358a36 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -11,75 +11,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Utility functions for working with log stores.""" +"""Utilities for log stores.""" from datetime import datetime -from typing import TYPE_CHECKING, List, Optional +from typing import Optional +from uuid import UUID, uuid4 -if TYPE_CHECKING: - from zenml.logging.logging import LogEntry - from zenml.models import LogsResponse - from zenml.zen_stores.base_zen_store import BaseZenStore +from pydantic import BaseModel, Field +from zenml.enums import LoggingLevels -def fetch_logs( - logs: "LogsResponse", - zen_store: "BaseZenStore", - start_time: Optional[datetime] = None, - end_time: Optional[datetime] = None, - limit: int = 20000, -) -> List["LogEntry"]: - """Fetch logs using the appropriate log store. - This function determines which log store to use based on the log_store_id - in the logs record. If log_store_id is present, it loads that log store. - Otherwise, it falls back to DefaultLogStore. +class LogEntry(BaseModel): + """A structured log entry with parsed information.""" - Args: - logs: The logs model containing metadata and log_store_id. - zen_store: The zen store to fetch log store component from. - start_time: Filter logs after this time. - end_time: Filter logs before this time. - limit: Maximum number of log entries to return. - - Returns: - List of log entries. 
- """ - from zenml.enums import StackComponentType - from zenml.stack import StackComponent - - if logs.log_store_id: - log_store_model = zen_store.get_stack_component(logs.log_store_id) - log_store = StackComponent.from_model(log_store_model) - else: - from zenml.log_stores.default.default_log_store import ( - DefaultLogStore, - DefaultLogStoreConfig, - ) - from zenml.utils.time_utils import utc_now - - if not logs.artifact_store_id: - return [] - - artifact_store_model = zen_store.get_stack_component( - logs.artifact_store_id - ) - - log_store = DefaultLogStore( - name="default_log_store_fallback", - id=artifact_store_model.id, - config=DefaultLogStoreConfig(), - flavor="default", - type=StackComponentType.LOG_STORE, - user=artifact_store_model.user, - workspace=artifact_store_model.workspace, - created=utc_now(), - updated=utc_now(), - ) - - return log_store.fetch( - logs_model=logs, - start_time=start_time, - end_time=end_time, - limit=limit, + message: str = Field(description="The log message content") + name: Optional[str] = Field( + default=None, + description="The name of the logger", + ) + level: Optional[LoggingLevels] = Field( + default=None, + description="The log level", + ) + timestamp: Optional[datetime] = Field( + default=None, + description="When the log was created", + ) + module: Optional[str] = Field( + default=None, description="The module that generated this log entry" + ) + filename: Optional[str] = Field( + default=None, + description="The name of the file that generated this log entry", + ) + lineno: Optional[int] = Field( + default=None, description="The fileno that generated this log entry" + ) + chunk_index: int = Field( + default=0, + description="The index of the chunk in the log entry", + ) + total_chunks: int = Field( + default=1, + description="The total number of chunks in the log entry", + ) + id: UUID = Field( + default_factory=uuid4, + description="The unique identifier of the log entry", ) diff --git a/src/zenml/logger.py b/src/zenml/logger.py index b637a936261..bc3b35e4f33 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -35,8 +35,8 @@ "step_names_in_console", default=False ) -_original_stdout_write: Optional[Any] = None -_original_stderr_write: Optional[Any] = None +_original_stdout: Optional[Any] = None +_original_stderr: Optional[Any] = None _stdout_wrapped: bool = False _stderr_wrapped: bool = False @@ -214,108 +214,94 @@ def format(self, record: logging.LogRecord) -> str: def _wrapped_write(original_write: Any, stream_name: str) -> Any: - """Wrap stdout/stderr write method to parse and route logs.""" + """Wrap stdout/stderr write method to route logs to LoggingContext.""" def wrapped_write(text: str) -> int: - """Wrap the write method to parse and route logs.""" - from zenml.logging.logging import get_active_log_store - - message = text - name = "unknown" - level = ( - LoggingLevels.INFO - if stream_name == "info" - else LoggingLevels.ERROR + """Write method that routes logs through LoggingContext.""" + from zenml.logging.logging import LoggingContext + + level_int = logging.INFO if stream_name == "stdout" else logging.ERROR + + record = logging.LogRecord( + name=stream_name, + level=level_int, + pathname="", + lineno=0, + msg=text, + args=(), + exc_info=None, + func="", ) - level_int = getattr(logging, level.name) - pathname = "" - lineno = 0 - funcName = "" - - has_newline = text.endswith("\n") - - stripped_text = text.strip() - if stripped_text.startswith("{") and stripped_text.endswith("}"): - try: - data = 
json.loads(stripped_text) - if "zenml" in data and data["zenml"] is True: - message = data.get("msg", text) - name = data.get("name", name) - level_str = data.get("level", level.name) - if hasattr(LoggingLevels, level_str): - level = getattr(LoggingLevels, level_str) - level_int = getattr(logging, level.name) - pathname = data.get("filename", pathname) - lineno = data.get("lineno", lineno) - funcName = data.get("module", funcName) - except Exception: - pass - - log_store = get_active_log_store() - if log_store: - record = logging.LogRecord( - name=name, - level=level_int, - pathname=pathname, - lineno=lineno, - msg=message, - args=(), - exc_info=None, - func=funcName, - ) - log_store.emit(record) - - formatted_message = format_console_message(message, level) - if has_newline: - formatted_message += "\n" - - return original_write(formatted_message) + LoggingContext.emit(record) + + return original_write(text) return wrapped_write def wrap_stdout_stderr() -> None: - """Wrap stdout and stderr write methods.""" + """Wrap stdout and stderr write methods to route through LoggingContext.""" global _stdout_wrapped, _stderr_wrapped - global _original_stdout_write, _original_stderr_write + global _original_stdout, _original_stderr if not _stdout_wrapped: - _original_stdout_write = getattr(sys.stdout, "write") - setattr( - sys.stdout, - "write", - _wrapped_write(_original_stdout_write, "info"), - ) + _original_stdout = sys.stdout + original_write = sys.stdout.write + sys.stdout.write = _wrapped_write(original_write, "stdout") _stdout_wrapped = True if not _stderr_wrapped: - _original_stderr_write = getattr(sys.stderr, "write") - setattr( - sys.stderr, - "write", - _wrapped_write(_original_stderr_write, "error"), - ) + _original_stderr = sys.stderr + original_write = sys.stderr.write + sys.stderr.write = _wrapped_write(original_write, "stderr") _stderr_wrapped = True -def get_zenml_handler() -> Any: - """Get console handler for logging. +class ZenMLLoggingHandler(logging.Handler): + """Custom handler that routes logs through LoggingContext.""" + + def emit(self, record: logging.LogRecord) -> None: + """Emit a log record through LoggingContext. + + Args: + record: The log record to emit. + """ + from zenml.logging.logging import LoggingContext + + LoggingContext.emit(record) + + +def get_console_handler() -> logging.Handler: + """Get console handler that writes to original stdout. Returns: A console handler. """ - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(ZenMLFormatter()) + handler = logging.StreamHandler(_original_stdout) return handler +def get_zenml_handler() -> logging.Handler: + """Get ZenML handler that routes logs through LoggingContext. + + Returns: + A ZenML handler. + """ + return ZenMLLoggingHandler() + + def init_logging() -> None: """Initialize the logging system.""" set_root_verbosity() wrap_stdout_stderr() - # Add the ZenML handler to the root logger + # Add both handlers to the root logger root_logger = logging.getLogger() + + # Console handler - writes to original stdout + root_logger.addHandler(get_console_handler()) + + # ZenML handler - routes through LoggingContext root_logger.addHandler(get_zenml_handler()) # Mute tensorflow cuda warnings diff --git a/src/zenml/logging/__init__.py b/src/zenml/logging/__init__.py index 7d170426951..b38f8acd12a 100644 --- a/src/zenml/logging/__init__.py +++ b/src/zenml/logging/__init__.py @@ -11,3 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
See the License for the specific language governing # permissions and limitations under the License. +"""ZenML logging module.""" + +from zenml.logging.logging import ( + LoggingContext, + generate_logs_request, + setup_orchestrator_logging, + setup_pipeline_logging, +) + +__all__ = [ + "LoggingContext", + "generate_logs_request", + "setup_orchestrator_logging", + "setup_pipeline_logging", +] diff --git a/src/zenml/logging/logging.py b/src/zenml/logging/logging.py index 3fc87ae082f..7e44a5f1d6a 100644 --- a/src/zenml/logging/logging.py +++ b/src/zenml/logging/logging.py @@ -13,240 +13,117 @@ # permissions and limitations under the License. """ZenML logging.""" +import logging +import threading from contextlib import contextmanager, nullcontext from contextvars import ContextVar -from datetime import datetime from types import TracebackType from typing import ( - TYPE_CHECKING, Any, Generator, Optional, - Tuple, Type, - Union, ) from uuid import UUID, uuid4 -from pydantic import BaseModel, Field - from zenml.client import Client from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, handle_bool_env_var, ) -from zenml.enums import LoggingLevels from zenml.logger import get_logger from zenml.models import ( LogsRequest, LogsResponse, - PipelineRunUpdate, PipelineSnapshotResponse, ) -from zenml.utils.time_utils import utc_now logger = get_logger(__name__) -if TYPE_CHECKING: - from zenml.log_stores.base_log_store import BaseLogStore - from zenml.models import LogsRequest, LogsResponse - -# Maximum number of log entries to return in a single request -MAX_ENTRIES_PER_REQUEST = 20000 -# Maximum size of a single log message in bytes (5KB) -DEFAULT_MESSAGE_SIZE = 5 * 1024 - -# Active log store and its associated log model -_active_log_context: ContextVar[ - Optional[Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]]] -] = ContextVar("active_log_context", default=None) +# Active logging context +active_logging_context: ContextVar[Optional["LoggingContext"]] = ContextVar( + "active_logging_context", default=None +) -def set_active_log_context( - log_store: Optional["BaseLogStore"], - log_model: Optional[Union["LogsRequest", "LogsResponse"]] = None, -) -> None: - """Set active log store and model for current context. +def generate_logs_request(source: str) -> LogsRequest: + """Generate a LogsRequest for logging. Args: - log_store: Log store to activate, or None to deactivate. - log_model: The log model associated with this context. - """ - if log_store is None: - _active_log_context.set(None) - else: - if log_model is None: - raise ValueError( - "log_model must be provided when log_store is set" - ) - _active_log_context.set((log_store, log_model)) - - -def get_active_log_context() -> Optional[ - Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]] -]: - """Get the active log store and model for the current context. - - Returns: - Tuple of (log_store, log_model), or None if no context is active. - """ - return _active_log_context.get() - - -def get_active_log_store() -> Optional["BaseLogStore"]: - """Get the active log store for the current context. - - Returns: - The active log store, or None if no log store is active. - """ - context = _active_log_context.get() - return context[0] if context else None - - -def get_active_log_model() -> Optional[Union["LogsRequest", "LogsResponse"]]: - """Get the active log model for the current context. + source: The source of the logs (e.g., "client", "orchestrator", "step"). 
Returns: - The active log model, or None if no context is active. + A LogsRequest object. """ - context = _active_log_context.get() - return context[1] if context else None - - -class LogEntry(BaseModel): - """A structured log entry with parsed information.""" - - message: str = Field(description="The log message content") - name: Optional[str] = Field( - default=None, - description="The name of the logger", - ) - level: Optional[LoggingLevels] = Field( - default=None, - description="The log level", - ) - timestamp: Optional[datetime] = Field( - default=None, - description="When the log was created", - ) - module: Optional[str] = Field( - default=None, description="The module that generated this log entry" - ) - filename: Optional[str] = Field( - default=None, - description="The name of the file that generated this log entry", - ) - lineno: Optional[int] = Field( - default=None, description="The fileno that generated this log entry" - ) - chunk_index: int = Field( - default=0, - description="The index of the chunk in the log entry", - ) - total_chunks: int = Field( - default=1, - description="The total number of chunks in the log entry", - ) - id: UUID = Field( - default_factory=uuid4, - description="The unique identifier of the log entry", + from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, + prepare_logs_uri, ) + client = Client() + log_store = client.active_stack.log_store + log_id = uuid4() + + if isinstance(log_store, ArtifactLogStore): + artifact_store = client.active_stack.artifact_store + return LogsRequest( + id=log_id, + source=source, + uri=prepare_logs_uri( + artifact_store=artifact_store, + log_id=log_id, + ), + artifact_store_id=artifact_store.id, + ) + else: + return LogsRequest( + id=log_id, + source=source, + log_store_id=log_store.id if log_store else None, + ) + class LoggingContext: """Context manager which collects logs using a LogStore.""" def __init__( self, - source: str, - log_model: Optional[Union["LogsRequest", "LogsResponse"]] = None, + log_model: LogsResponse, ) -> None: """Initialize the logging context. Args: - source: An identifier for the source of the logs - (e.g., "step", "orchestrator") - log_model: The log model to use for the logging context + log_model: The logs response model for this context. """ - if Client().active_stack.log_store: - self.log_store = Client().active_stack.log_store - else: - from zenml.log_stores import ( - DefaultLogStore, - DefaultLogStoreConfig, - DefaultLogStoreFlavor, - ) - - default_log_store_flavor = DefaultLogStoreFlavor() - - self.log_store = DefaultLogStore( - id=uuid4(), - name="temporary_default", - flavor=default_log_store_flavor.name, - type=default_log_store_flavor.type, - config=DefaultLogStoreConfig(), - environment={}, - user=Client().active_user.id, - created=utc_now(), - updated=utc_now(), - secrets=[], - ) - - self.source = source self.log_model = log_model + self._lock = threading.Lock() + self._previous_context: Optional[LoggingContext] = None - self._previous_log_context: Optional[ - Tuple["BaseLogStore", Union["LogsRequest", "LogsResponse"]] - ] = None + @classmethod + def emit(cls, record: logging.LogRecord) -> None: + """Emit a log record using the active logging context. - def generate_log_request(self) -> "LogsRequest": - """Create a log request model. + This class method is called by stdout/stderr wrappers and logging + handlers to route logs to the active log store. - Returns: - The log request model. + Args: + record: The log record to emit. 
""" - if self.log_model is not None: - - from zenml.log_stores.default.default_log_store import ( - DefaultLogStore, - prepare_logs_uri, - ) - - if isinstance(self.log_store, DefaultLogStore): - log_id = uuid4() - artifact_store = Client().active_stack.artifact_store - - return LogsRequest( - id=log_id, - source=self.source, - uri=prepare_logs_uri( - artifact_store=artifact_store, - log_id=log_id, - ), - artifact_store_id=artifact_store.id, - ) - else: - return LogsRequest( - id=uuid4(), - source=self.source, - log_store_id=self.log_store.id, - ) + try: + if context := active_logging_context.get(): + Client().active_stack.log_store.emit(record, context) + except Exception: + pass def __enter__(self) -> "LoggingContext": - """Enter the context and activate log collection. - - Saves the current active context to restore it on exit, - enabling nested logging contexts. + """Enter the context and set as active. Returns: self """ - self._previous_log_context = get_active_log_context() - - # Set the active context before activating the log store - # so that activate() can access the log model from context - set_active_log_context(self.log_store, self.log_model) - self.log_store.activate() + with self._lock: + self._previous_context = active_logging_context.get() + active_logging_context.set(self) return self @@ -256,9 +133,7 @@ def __exit__( exc_val: Optional[BaseException], exc_tb: Optional[TracebackType], ) -> None: - """Exit the context and deactivate log collection. - - Restores the previous active context to support nested contexts. + """Exit the context and restore previous context. Args: exc_type: The class of the exception. @@ -271,20 +146,15 @@ def __exit__( exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, ) - self.log_store.deactivate() - - if self._previous_log_context: - set_active_log_context( - self._previous_log_context[0], - self._previous_log_context[1], - ) - else: - set_active_log_context(None) + with self._lock: + active_logging_context.set(self._previous_context) +# TODO: Adjust the usage of this function def setup_orchestrator_logging( run_id: UUID, snapshot: "PipelineSnapshotResponse", + logs_response: LogsResponse, ) -> Any: """Set up logging for an orchestrator environment. @@ -294,12 +164,11 @@ def setup_orchestrator_logging( Args: run_id: The pipeline run ID. snapshot: The snapshot of the pipeline run. - logs_response: The logs response to continue from. + logs_response: The logs response for this orchestrator context. Returns: - The logs context + The logs context or nullcontext if logging is disabled. """ - # TODO: we need to establish the connection here again. try: logging_enabled = True @@ -317,7 +186,7 @@ def setup_orchestrator_logging( if not logging_enabled: return nullcontext() - return LoggingContext(source="orchestrator") + return LoggingContext(log_model=logs_response) except Exception as e: logger.error( f"Failed to setup orchestrator logging for run {run_id}: {e}" @@ -325,27 +194,22 @@ def setup_orchestrator_logging( return nullcontext() -# TODO: Double check this function +# TODO: Adjust the usage of this function @contextmanager def setup_pipeline_logging( - source: str, snapshot: "PipelineSnapshotResponse", - run_id: Optional[UUID] = None, - logs_response: Optional[LogsResponse] = None, -) -> Generator[Optional[LogsRequest], None, None]: + run_id: UUID, + logs_response: LogsResponse, +) -> Generator[LogsResponse, None, None]: """Set up logging for a pipeline run. Args: - source: The log source. snapshot: The snapshot of the pipeline run. 
run_id: The ID of the pipeline run. - logs_response: The logs response to continue from. - - Raises: - Exception: If updating the run with the logs request fails. + logs_response: The logs response for this pipeline context. Yields: - The logs request. + The logs response. """ logging_enabled = True @@ -355,23 +219,7 @@ def setup_pipeline_logging( logging_enabled = snapshot.pipeline_configuration.enable_pipeline_logs if logging_enabled: - client = Client() - - logs_context = LoggingContext(source="client", log_model=logs_response) - - if run_id and logs_response is None: - try: - run_update = PipelineRunUpdate( - add_logs=[logs_context.log_model] - ) - client.zen_store.update_run( - run_id=run_id, run_update=run_update - ) - except Exception as e: - logger.error(f"Failed to add logs to the run {run_id}: {e}") - raise e - - with logs_context: - yield logs_context.log_model + with LoggingContext(log_model=logs_response): + yield logs_response else: - yield None + yield logs_response diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index e28419218af..375777ec233 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -15,7 +15,6 @@ import signal import time -from contextlib import nullcontext from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple from zenml.client import Client @@ -30,6 +29,7 @@ from zenml.environment import get_run_environment_dict from zenml.exceptions import RunInterruptedException, RunStoppedException from zenml.logger import get_logger +from zenml.logging import LoggingContext from zenml.logging import logging as zenml_logging from zenml.models import ( PipelineRunRequest, @@ -41,7 +41,6 @@ from zenml.orchestrators import output_utils, publish_utils, step_run_utils from zenml.orchestrators import utils as orchestrator_utils from zenml.orchestrators.step_runner import StepRunner -from zenml.pipelines.build_utils import log_code_repository_usage from zenml.stack import Stack from zenml.steps import StepHeartBeatTerminationException, StepHeartbeatWorker from zenml.utils import env_utils, exception_utils, string_utils @@ -271,77 +270,82 @@ def launch(self) -> StepRunResponse: is_enabled_on_pipeline=self._snapshot.pipeline_configuration.enable_step_logs, ) - logs_context = nullcontext() logs_request = None if step_logging_enabled: - logs_context = zenml_logging.LoggingContext(source="step") - logs_request = logs_context.log_model + logs_request = zenml_logging.generate_logs_request(source="step") - with logs_context: - if run_was_created: - pipeline_run_metadata = self._stack.get_pipeline_run_metadata( - run_id=pipeline_run.id - ) - publish_utils.publish_pipeline_run_metadata( - pipeline_run_id=pipeline_run.id, - pipeline_run_metadata=pipeline_run_metadata, - ) - if model_version := pipeline_run.model_version: - step_run_utils.log_model_version_dashboard_url( - model_version=model_version - ) - - request_factory = step_run_utils.StepRunRequestFactory( - snapshot=self._snapshot, - pipeline_run=pipeline_run, - stack=self._stack, + if run_was_created: + pipeline_run_metadata = self._stack.get_pipeline_run_metadata( + run_id=pipeline_run.id ) - dynamic_config = self._step if self._snapshot.is_dynamic else None - step_run_request = request_factory.create_request( - invocation_id=self._invocation_id, - dynamic_config=dynamic_config, + publish_utils.publish_pipeline_run_metadata( + pipeline_run_id=pipeline_run.id, + pipeline_run_metadata=pipeline_run_metadata, ) - 
step_run_request.logs = logs_request - - try: - request_factory.populate_request(request=step_run_request) - except BaseException as e: - logger.exception( - f"Failed preparing step `{self._invocation_id}`." + if model_version := pipeline_run.model_version: + step_run_utils.log_model_version_dashboard_url( + model_version=model_version ) - step_run_request.status = ExecutionStatus.FAILED - step_run_request.end_time = utc_now() - step_run_request.exception_info = ( - exception_utils.collect_exception_information(e) + + request_factory = step_run_utils.StepRunRequestFactory( + snapshot=self._snapshot, + pipeline_run=pipeline_run, + stack=self._stack, + ) + dynamic_config = self._step if self._snapshot.is_dynamic else None + step_run_request = request_factory.create_request( + invocation_id=self._invocation_id, + dynamic_config=dynamic_config, + ) + step_run_request.logs = logs_request + + try: + request_factory.populate_request(request=step_run_request) + except BaseException as e: + logger.exception(f"Failed preparing step `{self._invocation_id}`.") + step_run_request.status = ExecutionStatus.FAILED + step_run_request.end_time = utc_now() + step_run_request.exception_info = ( + exception_utils.collect_exception_information(e) + ) + raise + finally: + step_run = Client().zen_store.create_run_step(step_run_request) + self._step_run = step_run + if model_version := step_run.model_version: + step_run_utils.log_model_version_dashboard_url( + model_version=model_version ) - raise - finally: - step_run = Client().zen_store.create_run_step(step_run_request) - self._step_run = step_run - if model_version := step_run.model_version: - step_run_utils.log_model_version_dashboard_url( - model_version=model_version - ) if not step_run.status.is_finished: logger.info(f"Step `{self._invocation_id}` has started.") - try: - self._run_step( - pipeline_run=pipeline_run, - step_run=step_run, - force_write_logs=force_write_logs, - ) - except RunStoppedException as e: - raise e - except BaseException as e: # noqa: E722 - logger.error( - "Failed to run step `%s`: %s", - self._invocation_id, - e, + if step_run.logs: + logs_context = LoggingContext(log_model=step_run.logs) + else: + logger.debug( + "There is no LogsResponseModel prepared for the step. The" + "step logging storage is disabled." ) - publish_utils.publish_failed_step_run(step_run.id) - raise + + with logs_context: + try: + # TODO: We still need to apply the fix for step operators here + self._run_step( + pipeline_run=pipeline_run, + step_run=step_run, + force_write_logs=lambda: None, + ) + except RunStoppedException as e: + raise e + except BaseException as e: # noqa: E722 + logger.error( + "Failed to run step `%s`: %s", + self._invocation_id, + e, + ) + publish_utils.publish_failed_step_run(step_run.id) + raise else: logger.info( f"Using cached version of step `{self._invocation_id}`." diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index 911ae60e11d..d40e5f7fb40 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -152,7 +152,7 @@ def run( logs_context = nullcontext() if step_logging_enabled: if step_run.logs: - logs_context = LoggingContext(source="step") + logs_context = LoggingContext(log_model=step_run.logs) else: logger.debug( "There is no LogsResponseModel prepared for the step. 
The" diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 99c4531244c..86e9111b982 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -1046,22 +1046,23 @@ def _run( logs_context = nullcontext() logs_request = None + snapshot = self._create_snapshot(**self._run_args) + self.log_pipeline_snapshot_metadata(snapshot) + if logging_enabled: - logs_context = LoggingContext("client") - logs_request = logs_context.log_model + from zenml.logging.logging import generate_logs_request - with logs_context: - snapshot = self._create_snapshot(**self._run_args) + logs_request = generate_logs_request(source="client") - self.log_pipeline_snapshot_metadata(snapshot) - run = ( - create_placeholder_run( - snapshot=snapshot, logs=logs_request - ) - if not snapshot.schedule - else None - ) + run = ( + create_placeholder_run(snapshot=snapshot, logs=logs_request) + if not snapshot.schedule + else None + ) + if logging_enabled and run and run.logs: + logs_context = LoggingContext(log_model=run.logs) + with logs_context: analytics_handler.metadata = ( self._get_pipeline_analytics_metadata( snapshot=snapshot, diff --git a/src/zenml/stack/flavor_registry.py b/src/zenml/stack/flavor_registry.py index 7244e1263d5..653071d316d 100644 --- a/src/zenml/stack/flavor_registry.py +++ b/src/zenml/stack/flavor_registry.py @@ -70,8 +70,8 @@ def builtin_flavors(self) -> List[Type[Flavor]]: from zenml.deployers import DockerDeployerFlavor, LocalDeployerFlavor from zenml.image_builders import LocalImageBuilderFlavor from zenml.log_stores import ( + ArtifactLogStoreFlavor, DatadogLogStoreFlavor, - DefaultLogStoreFlavor, ) from zenml.orchestrators import ( LocalDockerOrchestratorFlavor, @@ -89,7 +89,7 @@ def builtin_flavors(self) -> List[Type[Flavor]]: GitHubContainerRegistryFlavor, LocalImageBuilderFlavor, DockerDeployerFlavor, - DefaultLogStoreFlavor, + ArtifactLogStoreFlavor, DatadogLogStoreFlavor, LocalDeployerFlavor, ] diff --git a/src/zenml/stack/stack.py b/src/zenml/stack/stack.py index 902471fb902..a7739708c19 100644 --- a/src/zenml/stack/stack.py +++ b/src/zenml/stack/stack.py @@ -529,12 +529,42 @@ def deployer(self) -> Optional["BaseDeployer"]: return self._deployer @property - def log_store(self) -> Optional["BaseLogStore"]: + def log_store(self) -> "BaseLogStore": """The log store of the stack. + If no log store is configured, returns a temporary default + ArtifactLogStore. + Returns: The log store of the stack. 
""" + if self._log_store: + return self._log_store + + # Default to ArtifactLogStore if none configured + from uuid import uuid4 + + from zenml.log_stores import ( + ArtifactLogStore, + ArtifactLogStoreConfig, + ArtifactLogStoreFlavor, + ) + + flavor = ArtifactLogStoreFlavor() + now = utc_now() + + self._log_store = ArtifactLogStore( + id=uuid4(), + name="default", + flavor=flavor.name, + type=flavor.type, + config=ArtifactLogStoreConfig(), + environment={}, + user=Client().active_user.id, + created=now, + updated=now, + secrets=[], + ) return self._log_store def dict(self) -> Dict[str, str]: diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index c0a7b1ec26c..3182ef29f29 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -468,7 +468,7 @@ def run_logs( if ( snapshot.template_id or snapshot.source_snapshot_id ) and server_config().workload_manager_enabled: - from zenml.log_stores.default.default_log_store import ( + from zenml.log_stores.artifact.artifact_log_store import ( parse_log_entry, ) diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 7b500020d05..7e72e885e4c 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -30,7 +30,8 @@ from zenml.enums import ExecutionStatus from zenml.exceptions import AuthorizationException from zenml.log_stores import fetch_logs -from zenml.logging.logging import MAX_ENTRIES_PER_REQUEST, LogEntry +from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST +from zenml.log_stores.utils import LogEntry from zenml.models import ( Page, StepRunFilter, diff --git a/tests/integration/functional/zen_stores/test_zen_store.py b/tests/integration/functional/zen_stores/test_zen_store.py index a49d23785f5..c7a5dbb9e59 100644 --- a/tests/integration/functional/zen_stores/test_zen_store.py +++ b/tests/integration/functional/zen_stores/test_zen_store.py @@ -84,7 +84,6 @@ EntityExistsError, IllegalOperationError, ) -from zenml.logging.step_logging import fetch_log_records, prepare_logs_uri from zenml.metadata.metadata_types import MetadataTypeEnum from zenml.models import ( APIKeyFilter, From d4243af9d85589f0df0888468880703b2b5f25e6 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 14:41:27 +0100 Subject: [PATCH 22/81] new changes --- src/zenml/log_stores/__init__.py | 6 +- .../log_stores/artifact/artifact_log_store.py | 11 +- .../datadog/datadog_log_exporter.py | 123 ++++++++++++++++++ .../log_stores/datadog/datadog_log_store.py | 106 +-------------- src/zenml/log_stores/otel/otel_log_store.py | 4 +- 5 files changed, 137 insertions(+), 113 deletions(-) create mode 100644 src/zenml/log_stores/datadog/datadog_log_exporter.py diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index 72a1493d171..f288ef5d05a 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -40,9 +40,9 @@ from zenml.log_stores.datadog.datadog_log_store import ( DatadogLogStore, ) -from zenml.log_stores.otel.otel_flavor import ( - OtelLogStoreConfig, - OtelLogStoreFlavor, +from zenml.log_stores.datadog.datadog_flavor import ( + DatadogLogStoreConfig, + DatadogLogStoreFlavor, ) __all__ = [ diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 30417211be4..261b2ffa0d3 100644 --- 
a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -17,7 +17,6 @@ import re from datetime import datetime from typing import ( - TYPE_CHECKING, Iterator, List, Optional, @@ -26,6 +25,8 @@ ) from uuid import UUID +from opentelemetry.sdk._logs.export import LogExporter + from zenml.artifact_stores import BaseArtifactStore from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client @@ -42,11 +43,6 @@ from zenml.utils.io_utils import sanitize_remote_path from zenml.zen_stores.base_zen_store import BaseZenStore -if TYPE_CHECKING: - from opentelemetry.sdk._logs.export import LogExporter - - from zenml.artifact_stores import BaseArtifactStore - ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") logger = get_logger(__name__) @@ -286,7 +282,8 @@ def fetch( if not logs_model.artifact_store_id: raise ValueError( - "logs_model.artifact_store_id is required for ArtifactLogStore.fetch()" + "logs_model.artifact_store_id is required " + "for ArtifactLogStore.fetch()" ) client = Client() diff --git a/src/zenml/log_stores/datadog/datadog_log_exporter.py b/src/zenml/log_stores/datadog/datadog_log_exporter.py new file mode 100644 index 00000000000..d8f7937a6e0 --- /dev/null +++ b/src/zenml/log_stores/datadog/datadog_log_exporter.py @@ -0,0 +1,123 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry exporter that sends logs to Datadog.""" + +from typing import Any, List + +import requests +from opentelemetry.sdk._logs import LogData +from opentelemetry.sdk._logs.export import LogExporter, LogExportResult + +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +class DatadogLogExporter(LogExporter): + """Custom log exporter that sends logs to Datadog's HTTP intake API. + + This exporter transforms OpenTelemetry log records into Datadog's format + and sends them via HTTP POST without requiring the Datadog SDK. + """ + + def __init__( + self, + api_key: str, + site: str = "datadoghq.com", + ): + """Initialize the Datadog log exporter. + + Args: + api_key: Datadog API key. + site: Datadog site domain. + """ + self.endpoint = f"https://http-intake.logs.{site}/v1/input" + self.headers = { + "DD-API-KEY": api_key, + "Content-Type": "application/json", + } + + def export(self, batch: List[LogData]) -> Any: + """Export a batch of log records to Datadog. + + Args: + batch: List of LogData objects from OpenTelemetry. + + Returns: + LogExportResult indicating success or failure. 
+ """ + logs = [] + for log_data in batch: + log_record = log_data.log_record + + resource_attrs = {} + if log_record.resource: + resource_attrs = dict(log_record.resource.attributes) + + log_attrs = {} + if log_record.attributes: + log_attrs = dict(log_record.attributes) + + all_attrs = {**resource_attrs, **log_attrs} + + log_entry = { + "message": str(log_record.body), + } + + if log_record.severity_text: + log_entry["status"] = log_record.severity_text.lower() + + if log_record.timestamp: + log_entry["timestamp"] = int(log_record.timestamp / 1_000_000) + + if all_attrs: + tags = [f"{k}:{v}" for k, v in all_attrs.items()] + log_entry["ddtags"] = ",".join(tags) + + logs.append(log_entry) + + try: + response = requests.post( + self.endpoint, + headers=self.headers, + json=logs, + timeout=10, + ) + + if response.status_code in [200, 202]: + logger.debug(f"Successfully sent {len(logs)} logs to Datadog") + return LogExportResult.SUCCESS + else: + logger.warning( + f"Datadog rejected logs: {response.status_code} - {response.text[:200]}" + ) + return LogExportResult.FAILURE + except Exception as e: + logger.error(f"Failed to export logs to Datadog: {e}") + return LogExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered logs. + + Args: + timeout_millis: Timeout in milliseconds. + + Returns: + True if successful. + """ + return True diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index e7481d1811b..cb442b3cd6d 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -17,8 +17,7 @@ from typing import Any, Dict, List, Optional, cast import requests -from opentelemetry.sdk._logs import LogData -from opentelemetry.sdk._logs.export import LogExporter, LogExportResult +from opentelemetry.sdk._logs.export import LogExporter from zenml.enums import LoggingLevels from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig @@ -30,105 +29,6 @@ logger = get_logger(__name__) -class DatadogLogExporter(LogExporter): - """Custom log exporter that sends logs to Datadog's HTTP intake API. - - This exporter transforms OpenTelemetry log records into Datadog's format - and sends them via HTTP POST without requiring the Datadog SDK. - """ - - def __init__( - self, - api_key: str, - site: str = "datadoghq.com", - ): - """Initialize the Datadog log exporter. - - Args: - api_key: Datadog API key. - site: Datadog site domain. - """ - self.endpoint = f"https://http-intake.logs.{site}/v1/input" - self.headers = { - "DD-API-KEY": api_key, - "Content-Type": "application/json", - } - - def export(self, batch: List[LogData]) -> Any: - """Export a batch of log records to Datadog. - - Args: - batch: List of LogData objects from OpenTelemetry. - - Returns: - LogExportResult indicating success or failure. 
- """ - logs = [] - for log_data in batch: - log_record = log_data.log_record - - resource_attrs = {} - if log_record.resource: - resource_attrs = dict(log_record.resource.attributes) - - log_attrs = {} - if log_record.attributes: - log_attrs = dict(log_record.attributes) - - all_attrs = {**resource_attrs, **log_attrs} - - log_entry = { - "message": str(log_record.body), - } - - if log_record.severity_text: - log_entry["status"] = log_record.severity_text.lower() - - if log_record.timestamp: - log_entry["timestamp"] = int(log_record.timestamp / 1_000_000) - - if all_attrs: - tags = [f"{k}:{v}" for k, v in all_attrs.items()] - log_entry["ddtags"] = ",".join(tags) - - logs.append(log_entry) - - try: - response = requests.post( - self.endpoint, - headers=self.headers, - json=logs, - timeout=10, - ) - - if response.status_code in [200, 202]: - logger.debug(f"Successfully sent {len(logs)} logs to Datadog") - return LogExportResult.SUCCESS - else: - logger.warning( - f"Datadog rejected logs: {response.status_code} - {response.text[:200]}" - ) - return LogExportResult.FAILURE - except Exception as e: - logger.error(f"Failed to export logs to Datadog: {e}") - return LogExportResult.FAILURE - - def shutdown(self) -> None: - """Shutdown the exporter.""" - pass - - def force_flush(self, timeout_millis: int = 30000) -> bool: - """Force flush any buffered logs. - - Args: - timeout_millis: Timeout in milliseconds. - - Returns: - True if successful. - """ - return True - - class DatadogLogStore(OtelLogStore): """Log store that exports logs to Datadog. @@ -151,6 +51,10 @@ def get_exporter(self) -> "LogExporter": Returns: DatadogLogExporter configured with API key and site. """ + from zenml.log_stores.datadog.datadog_log_exporter import ( + DatadogLogExporter, + ) + return DatadogLogExporter( api_key=self.config.api_key.get_secret_value(), site=self.config.site, diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index d7bcd0b383c..07c9693c39c 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -123,12 +123,12 @@ def emit( # Attach the LoggingContext to OTel's context so the exporter # can access it in the background processor thread ctx = otel_context.set_value(LOGGING_CONTEXT_KEY, context) - + otel_logger = self._provider.get_logger( record.name or "unknown", schema_url=None, ) - + otel_logger.emit( timestamp=int(record.created * 1e9), observed_timestamp=int(record.created * 1e9), From 07eb569209a1532b89d5ace029a62134f72769da Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 14:54:43 +0100 Subject: [PATCH 23/81] stack changes --- src/zenml/stack/stack.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/zenml/stack/stack.py b/src/zenml/stack/stack.py index a7739708c19..72d181e1d21 100644 --- a/src/zenml/stack/stack.py +++ b/src/zenml/stack/stack.py @@ -540,32 +540,31 @@ def log_store(self) -> "BaseLogStore": """ if self._log_store: return self._log_store + else: + from uuid import uuid4 - # Default to ArtifactLogStore if none configured - from uuid import uuid4 + from zenml.log_stores import ( + ArtifactLogStore, + ArtifactLogStoreConfig, + ArtifactLogStoreFlavor, + ) - from zenml.log_stores import ( - ArtifactLogStore, - ArtifactLogStoreConfig, - ArtifactLogStoreFlavor, - ) + flavor = ArtifactLogStoreFlavor() + now = utc_now() - flavor = ArtifactLogStoreFlavor() - now = utc_now() - - self._log_store = 
ArtifactLogStore( - id=uuid4(), - name="default", - flavor=flavor.name, - type=flavor.type, - config=ArtifactLogStoreConfig(), - environment={}, - user=Client().active_user.id, - created=now, - updated=now, - secrets=[], - ) - return self._log_store + self._log_store = ArtifactLogStore( + id=uuid4(), + name="default", + flavor=flavor.name, + type=flavor.type, + config=ArtifactLogStoreConfig(), + environment={}, + user=Client().active_user.id, + created=now, + updated=now, + secrets=[], + ) + return self._log_store def dict(self) -> Dict[str, str]: """Converts the stack into a dictionary. From 9e49f3bdd30dc9e972994fc92fd79c0d52e66563 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 15:54:06 +0100 Subject: [PATCH 24/81] more fixes --- .../artifact/artifact_log_exporter.py | 11 -- src/zenml/logger.py | 169 +++++++++--------- .../versions/5c0a1c787128_add_log_stores.py | 4 +- 3 files changed, 91 insertions(+), 93 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 11ecbe473b3..b442c5a8d8d 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -275,14 +275,3 @@ def _write_to_artifact_store( def shutdown(self) -> None: """Shutdown the exporter.""" pass - - def force_flush(self, timeout_millis: int = 30000) -> bool: - """Force flush any buffered logs. - - Args: - timeout_millis: Timeout in milliseconds. - - Returns: - True (no buffering at this level). - """ - return True diff --git a/src/zenml/logger.py b/src/zenml/logger.py index bc3b35e4f33..b1f6ddb3626 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -16,9 +16,10 @@ import json import logging import os +import re import sys from contextvars import ContextVar -from typing import Any, Optional +from typing import Any, Dict, Optional from rich.traceback import install as rich_tb_install @@ -31,6 +32,10 @@ ) from zenml.enums import LoggingLevels +ZENML_LOGGING_COLORS_DISABLED = handle_bool_env_var( + ENV_ZENML_LOGGING_COLORS_DISABLED, False +) + step_names_in_console: ContextVar[bool] = ContextVar( "step_names_in_console", default=False ) @@ -83,44 +88,36 @@ def _add_step_name_to_message(message: str) -> str: return message -def format_console_message( - message: str, level: LoggingLevels = LoggingLevels.INFO -) -> str: - """Format a message for console output with colors and step names. - - This function applies: - 1. Step name prefixing (if step_names_in_console is True) - 2. Color formatting (unless ZENML_LOGGING_COLORS_DISABLED) - 3. Special formatting for quoted text (purple) and URLs (blue) - - Args: - message: The message to format. - level: The logging level for color selection. - - Returns: - The formatted message. - """ - import re +class ConsoleFormatter(logging.Formatter): + """Formats logs according to custom specifications.""" - try: - if step_names_in_console.get(): - message = _add_step_name_to_message(message) - except Exception: - pass + grey: str = "\x1b[90m" + white: str = "\x1b[37m" + pink: str = "\x1b[35m" + green: str = "\x1b[32m" + yellow: str = "\x1b[33m" + red: str = "\x1b[31m" + cyan: str = "\x1b[1;36m" + bold_red: str = "\x1b[31;1m" + purple: str = "\x1b[38;5;105m" + blue: str = "\x1b[34m" + reset: str = "\x1b[0m" + + def _get_format_template(self, record: logging.LogRecord) -> str: + """Get the format template based on the logging level. 
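Stepping back to the Stack.log_store fallback above: it guarantees that every stack resolves a log store, even when none is registered as a component. A quick sketch of what a caller sees, assuming this patch is applied:

from zenml.client import Client

stack = Client().active_stack
log_store = stack.log_store  # ad-hoc ArtifactLogStore named "default" if unset
print(log_store.name, log_store.flavor, log_store.type)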
- if handle_bool_env_var(ENV_ZENML_LOGGING_COLORS_DISABLED, False): - return message + Args: + record: The log record to format. - grey = "\x1b[90m" - white = "\x1b[37m" - yellow = "\x1b[33m" - red = "\x1b[31m" - bold_red = "\x1b[31;1m" - purple = "\x1b[38;5;105m" - blue = "\x1b[34m" - reset = "\x1b[0m" + Returns: + The format template string. + """ + if get_logging_level() == LoggingLevels.DEBUG: + return "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" + else: + return "%(message)s" - COLORS = { + COLORS: Dict[LoggingLevels, str] = { LoggingLevels.DEBUG: grey, LoggingLevels.INFO: white, LoggingLevels.WARN: yellow, @@ -128,26 +125,66 @@ def format_console_message( LoggingLevels.CRITICAL: bold_red, } - level_color = COLORS.get(level, white) - - formatted_message = f"{level_color}{message}{reset}" + def format(self, record: logging.LogRecord) -> str: + """Converts a log record to a (colored) string. - quoted_groups = re.findall("`([^`]*)`", formatted_message) - for quoted in quoted_groups: - formatted_message = formatted_message.replace( - "`" + quoted + "`", - f"{reset}{purple}{quoted}{level_color}", - ) + Args: + record: LogRecord generated by the code. - url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" - urls = re.findall(url_pattern, formatted_message) - for url in urls: - formatted_message = formatted_message.replace( - url, - f"{reset}{blue}{url}{level_color}", + Returns: + A string formatted according to specifications. + """ + format_template = self._get_format_template(record) + + message = record.getMessage() + try: + if step_names_in_console.get(): + message = _add_step_name_to_message(message) + except Exception: + pass + + modified_record = logging.LogRecord( + name=record.name, + level=record.levelno, + pathname=record.pathname, + lineno=record.lineno, + msg=message, + args=(), + exc_info=record.exc_info, ) - return formatted_message + if ZENML_LOGGING_COLORS_DISABLED: + formatter = logging.Formatter(format_template) + return formatter.format(modified_record) + else: + log_fmt = ( + self.COLORS[LoggingLevels(record.levelno)] + + format_template + + self.reset + ) + formatter = logging.Formatter(log_fmt) + formatted_message = formatter.format(modified_record) + quoted_groups = re.findall("`([^`]*)`", formatted_message) + for quoted in quoted_groups: + formatted_message = formatted_message.replace( + "`" + quoted + "`", + self.reset + + self.purple + + quoted + + self.COLORS.get(LoggingLevels(record.levelno)), + ) + + url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" + urls = re.findall(url_pattern, formatted_message) + for url in urls: + formatted_message = formatted_message.replace( + url, + self.reset + + self.blue + + url + + self.COLORS.get(LoggingLevels(record.levelno)), + ) + return formatted_message def get_logging_level() -> LoggingLevels: @@ -184,35 +221,6 @@ def set_root_verbosity() -> None: get_logger(__name__).debug("Logging NOTSET") -class ZenMLFormatter(logging.Formatter): - """Formats logs according to custom specifications.""" - - def format(self, record: logging.LogRecord) -> str: - """Converts a log record to a (colored) string or structured JSON. - - Args: - record: LogRecord generated by the code. - - Returns: - A string formatted according to specifications. 
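A short usage sketch for the new ConsoleFormatter; the logger name and message are arbitrary, and the coloring is skipped when ZENML_LOGGING_COLORS_DISABLED is set:

import logging

from zenml.logger import ConsoleFormatter  # added by this patch

handler = logging.StreamHandler()
handler.setFormatter(ConsoleFormatter())

demo_logger = logging.getLogger("demo")
demo_logger.addHandler(handler)
demo_logger.setLevel(logging.INFO)

# Backticked identifiers render in purple and URLs in blue.
demo_logger.info("Loaded step `trainer`, docs at https://docs.zenml.io")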
- """ - data = { - "zenml": True, - "timestamp": self.formatTime(record, datefmt="%Y-%m-%dT%H:%M:%S"), - "level": record.levelname, - "name": record.name, - "msg": record.getMessage(), - "module": record.module, - "filename": record.filename, - "lineno": record.lineno, - } - - if record.exc_info: - data["exc_info"] = self.formatException(record.exc_info) - - return json.dumps(data, ensure_ascii=False) - - def _wrapped_write(original_write: Any, stream_name: str) -> Any: """Wrap stdout/stderr write method to route logs to LoggingContext.""" @@ -278,6 +286,7 @@ def get_console_handler() -> logging.Handler: A console handler. """ handler = logging.StreamHandler(_original_stdout) + handler.setFormatter(ConsoleFormatter()) return handler diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index da0ed866375..042bcfae561 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -1,7 +1,7 @@ """add log stores [5c0a1c787128]. Revision ID: 5c0a1c787128 -Revises: a5a17015b681 +Revises: 4dd9d3afd2c0 Create Date: 2025-10-24 10:06:54.402219 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "5c0a1c787128" -down_revision = "a5a17015b681" +down_revision = "4dd9d3afd2c0" branch_labels = None depends_on = None From 997c23af634ba47827613b002900e8a47fed1b5e Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 16:02:41 +0100 Subject: [PATCH 25/81] new changes --- src/zenml/log_stores/base_log_store.py | 29 +++++++------------------- src/zenml/logger.py | 1 - 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 5bd2bd7de53..8940f7ae3b8 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -16,16 +16,14 @@ import logging from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, List, Optional, Type, cast +from typing import List, Optional, Type, cast from zenml.enums import StackComponentType +from zenml.log_stores.utils import LogEntry +from zenml.logging.logging import LoggingContext +from zenml.models import LogsResponse from zenml.stack import Flavor, StackComponent, StackComponentConfig -if TYPE_CHECKING: - from zenml.log_stores.utils import LogEntry - from zenml.logging.logging import LoggingContext - from zenml.models import LogsResponse - # Maximum number of log entries to return in a single request MAX_ENTRIES_PER_REQUEST = 20000 # Maximum size of a single log message in bytes (5KB) @@ -57,41 +55,30 @@ def config(self) -> BaseLogStoreConfig: def emit( self, record: logging.LogRecord, - context: "LoggingContext", + context: LoggingContext, ) -> None: """Process a log record from the logging system. - This method is called by the ZenML logging system for each log - record that should be stored by this log store. Implementations - should process the record according to their backend's requirements. - Args: record: The Python logging.LogRecord to process. - context: The logging context containing the log_model with routing - metadata (pipeline_run_id, step_run_id, etc.). + context: The logging context containing the log_model. 
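To make the emit/fetch contract concrete, a minimal sketch of a third-party log store is shown below; flavor registration and config wiring are omitted, the class names are invented, and the imports follow where these symbols live at the end of this patch series:

import logging
from datetime import datetime
from typing import List, Optional

from zenml.log_stores import (
    BaseLogStore,
    BaseLogStoreConfig,
    LogEntry,
    LoggingContext,
)
from zenml.log_stores.base_log_store import (
    DEFAULT_MESSAGE_SIZE,
    MAX_ENTRIES_PER_REQUEST,
)
from zenml.models import LogsResponse


class StdoutLogStoreConfig(BaseLogStoreConfig):
    """Config for the illustrative log store (no options)."""


class StdoutLogStore(BaseLogStore):
    """Illustrative log store that prints records and persists nothing."""

    def emit(
        self, record: logging.LogRecord, context: LoggingContext
    ) -> None:
        # Tag every record with the id of the logs model it belongs to.
        print(f"[{context.log_model.id}] {record.getMessage()}")

    def fetch(
        self,
        logs_model: LogsResponse,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        limit: int = MAX_ENTRIES_PER_REQUEST,
        message_size: int = DEFAULT_MESSAGE_SIZE,
    ) -> List[LogEntry]:
        # Nothing is persisted by this store, so there is nothing to return.
        return []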
""" @abstractmethod def fetch( self, - logs_model: "LogsResponse", + logs_model: LogsResponse, start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = MAX_ENTRIES_PER_REQUEST, message_size: int = DEFAULT_MESSAGE_SIZE, - ) -> List["LogEntry"]: + ) -> List[LogEntry]: """Fetch logs from the log store. This method is called from the server to retrieve logs for display on the dashboard or via API. The implementation should not require any integration-specific SDKs that aren't available on the server. - Each log store implementation can extract the information it needs - from logs_model: - - ArtifactLogStore: uses logs_model.uri and logs_model.artifact_store_id - - OtelLogStore: uses logs_model.pipeline_run_id, step_run_id, source - - DatadogLogStore: uses logs_model.pipeline_run_id, step_run_id, source - Args: logs_model: The logs model containing metadata about the logs. start_time: Filter logs after this time. diff --git a/src/zenml/logger.py b/src/zenml/logger.py index b1f6ddb3626..392ca52420d 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -13,7 +13,6 @@ # permissions and limitations under the License. """Logger implementation.""" -import json import logging import os import re From 61802c79fb39c2e7540eafe9d02dc5a25b489695 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 18:05:07 +0100 Subject: [PATCH 26/81] new stuff --- .../execution/pipeline/dynamic/runner.py | 33 +---- .../kubernetes_orchestrator_entrypoint.py | 42 +----- src/zenml/logging/logging.py | 121 +++++++++--------- src/zenml/orchestrators/step_launcher.py | 7 +- src/zenml/orchestrators/step_runner.py | 10 +- src/zenml/pipelines/pipeline_definition.py | 10 +- 6 files changed, 81 insertions(+), 142 deletions(-) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 3fc97a77777..4f41daab20d 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -16,7 +16,6 @@ import contextvars import inspect from concurrent.futures import ThreadPoolExecutor -from contextlib import nullcontext from typing import ( TYPE_CHECKING, Any, @@ -50,7 +49,9 @@ ) from zenml.execution.step.utils import launch_step from zenml.logger import get_logger -from zenml.logging.logging import setup_pipeline_logging +from zenml.logging.logging import ( + setup_orchestrator_logging, +) from zenml.models import ( ArtifactVersionResponse, PipelineRunResponse, @@ -149,37 +150,16 @@ def pipeline(self) -> "DynamicPipeline": def run_pipeline(self) -> None: """Run the pipeline.""" - from zenml.logging.logging import generate_logs_request - - # Generate logs request for orchestrator logging - logs_request = generate_logs_request(source="orchestrator") - with InMemoryArtifactCache(): run = self._run or create_placeholder_run( snapshot=self._snapshot, orchestrator_run_id=self._orchestrator_run_id, - logs=logs_request, ) - # Get logs response from the run and set up logging context - logs_response = run.logs - if not logs_response and run.log_collection: - for log in run.log_collection: - if log.source == "orchestrator": - logs_response = log - break - - logs_context = ( - setup_pipeline_logging( - snapshot=self._snapshot, - run_id=run.id, - logs_response=logs_response, - ) - if logs_response - else nullcontext() + logging_context = setup_orchestrator_logging( + pipeline_run=run, snapshot=self._snapshot ) - - with logs_context: + with logging_context: assert ( self._snapshot.pipeline_spec ) # 
Always exists for new snapshots @@ -193,7 +173,6 @@ def run_pipeline(self) -> None: ): self._orchestrator.run_init_hook(snapshot=self._snapshot) try: - # TODO: step logging isn't threadsafe # TODO: what should be allowed as pipeline returns? # (artifacts, json serializable, anything?) # how do we show it in the UI? diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index 640f6592c85..5ca097e3c65 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -18,7 +18,6 @@ import socket import threading import time -from contextlib import nullcontext from typing import List, Optional, Tuple, cast from uuid import UUID @@ -60,7 +59,6 @@ from zenml.logging.logging import setup_orchestrator_logging from zenml.models import ( PipelineRunResponse, - PipelineRunUpdate, PipelineSnapshotResponse, ) from zenml.orchestrators import publish_utils @@ -244,7 +242,6 @@ def main() -> None: namespace=namespace, job_name=job_name, ) - logs_response = None if run_id and orchestrator_run_id: logger.info("Continuing existing run `%s`.", run_id) @@ -257,43 +254,17 @@ def main() -> None: ) logger.debug("Reconstructed nodes: %s", nodes) - # Continue logging to the same log file if it exists - for log_response in pipeline_run.log_collection or []: - if log_response.source == "orchestrator": - logs_response = log_response - break else: orchestrator_run_id = orchestrator_pod_name - # Generate logs request for orchestrator logging - from zenml.logging.logging import generate_logs_request - - logs_request = generate_logs_request(source="orchestrator") - if args.run_id: - pipeline_run = client.zen_store.update_run( - run_id=args.run_id, - run_update=PipelineRunUpdate( - orchestrator_run_id=orchestrator_run_id, - add_logs=[logs_request], - ), - ) + pipeline_run = client.zen_store.get_pipeline_run(args.run_id) else: pipeline_run = create_placeholder_run( snapshot=snapshot, orchestrator_run_id=orchestrator_run_id, - logs=logs_request, ) - # Get logs_response from the created/updated run - if pipeline_run.logs: - logs_response = pipeline_run.logs - elif pipeline_run.log_collection: - for log_response in pipeline_run.log_collection: - if log_response.source == "orchestrator": - logs_response = log_response - break - # Store in the job annotations so we can continue the run if the pod # is restarted kube_utils.update_job( @@ -310,16 +281,9 @@ def main() -> None: for step_name, step in snapshot.step_configurations.items() ] - logs_context = ( - setup_orchestrator_logging( - run_id=pipeline_run.id, - snapshot=snapshot, - logs_response=logs_response, - ) - if logs_response - else nullcontext() + logs_context = setup_orchestrator_logging( + pipeline_run=pipeline_run, snapshot=snapshot, ) - with logs_context: step_command = StepEntrypointConfiguration.get_entrypoint_command() mount_local_stores = active_stack.orchestrator.config.is_local diff --git a/src/zenml/logging/logging.py b/src/zenml/logging/logging.py index 7e44a5f1d6a..6d31885a81e 100644 --- a/src/zenml/logging/logging.py +++ b/src/zenml/logging/logging.py @@ -15,18 +15,19 @@ import logging import threading -from contextlib import contextmanager, nullcontext +from contextlib import nullcontext from contextvars import ContextVar from types import TracebackType from typing import ( Any, - Generator, + List, 
Optional, Type, ) -from uuid import UUID, uuid4 +from uuid import uuid4 from zenml.client import Client +from zenml.config.pipeline_configurations import PipelineConfiguration from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, handle_bool_env_var, @@ -35,6 +36,8 @@ from zenml.models import ( LogsRequest, LogsResponse, + PipelineRunResponse, + PipelineRunUpdate, PipelineSnapshotResponse, ) @@ -150,11 +153,44 @@ def __exit__( active_logging_context.set(self._previous_context) -# TODO: Adjust the usage of this function +def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: + """Check if logging is enabled for a pipeline configuration. + + Args: + pipeline_configuration: The pipeline configuration. + + Returns: + True if logging is enabled, False if disabled. + """ + if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): + return False + elif pipeline_configuration.enable_pipeline_logs is not None: + return pipeline_configuration.enable_pipeline_logs + else: + return True + + +def search_logs_by_source( + logs_collection: List[LogsResponse], source: str +) -> Optional[LogsResponse]: + """Get the logs response for a given source. + + Args: + logs_collection: The logs collection. + source: The source of the logs. + + Returns: + The logs response for the given source. + """ + for log in logs_collection: + if log.source == source: + return log + return None + + def setup_orchestrator_logging( - run_id: UUID, + pipeline_run: "PipelineRunResponse", snapshot: "PipelineSnapshotResponse", - logs_response: LogsResponse, ) -> Any: """Set up logging for an orchestrator environment. @@ -162,64 +198,35 @@ def setup_orchestrator_logging( consistent logging behavior. Args: - run_id: The pipeline run ID. + pipeline_run: The pipeline run. snapshot: The snapshot of the pipeline run. - logs_response: The logs response for this orchestrator context. Returns: The logs context or nullcontext if logging is disabled. """ - try: - logging_enabled = True - - if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): - logging_enabled = False - else: - if ( - snapshot.pipeline_configuration.enable_pipeline_logs - is not None - ): - logging_enabled = ( - snapshot.pipeline_configuration.enable_pipeline_logs - ) - - if not logging_enabled: - return nullcontext() - - return LoggingContext(log_model=logs_response) - except Exception as e: - logger.error( - f"Failed to setup orchestrator logging for run {run_id}: {e}" - ) - return nullcontext() + logging_enabled = is_logging_enabled(snapshot.pipeline_configuration) + if not logging_enabled: + return nullcontext() -# TODO: Adjust the usage of this function -@contextmanager -def setup_pipeline_logging( - snapshot: "PipelineSnapshotResponse", - run_id: UUID, - logs_response: LogsResponse, -) -> Generator[LogsResponse, None, None]: - """Set up logging for a pipeline run. + if orchestrator_logs := search_logs_by_source( + pipeline_run.log_collection, "orchestrator" + ): + return LoggingContext(log_model=orchestrator_logs) - Args: - snapshot: The snapshot of the pipeline run. - run_id: The ID of the pipeline run. - logs_response: The logs response for this pipeline context. - - Yields: - The logs response. 
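The pattern this enables in orchestrator entrypoints is sketched below; pipeline_run and snapshot are assumed to have been fetched already, and execute_dag stands in for the orchestrator's own loop (the import path is the one used at this point in the series, before the later move to zenml.log_stores.utils):

from zenml.logging.logging import setup_orchestrator_logging

logs_context = setup_orchestrator_logging(
    pipeline_run=pipeline_run,
    snapshot=snapshot,
)
with logs_context:
    # Logs emitted here attach to the run's "orchestrator" log entry,
    # or are simply not stored when pipeline logs are disabled (nullcontext).
    execute_dag(snapshot)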
- """ - logging_enabled = True + logs_request = generate_logs_request(source="orchestrator") + try: + client = Client() + run_update = PipelineRunUpdate(add_logs=[logs_request]) + pipeline_run = client.zen_store.update_run( + run_id=pipeline_run.id, run_update=run_update + ) + except Exception as e: + logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") - if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): - logging_enabled = False - elif snapshot.pipeline_configuration.enable_pipeline_logs is not None: - logging_enabled = snapshot.pipeline_configuration.enable_pipeline_logs + if orchestrator_logs := search_logs_by_source( + pipeline_run.log_collection, "orchestrator" + ): + return LoggingContext(log_model=orchestrator_logs) - if logging_enabled: - with LoggingContext(log_model=logs_response): - yield logs_response - else: - yield logs_response + return nullcontext() diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 5aaee7f0f6c..720c7f61851 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -15,6 +15,7 @@ import signal import time +from contextlib import nullcontext from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple from zenml.client import Client @@ -320,13 +321,9 @@ def launch(self) -> StepRunResponse: if not step_run.status.is_finished: logger.info(f"Step `{self._invocation_id}` has started.") + logs_context = nullcontext() if step_run.logs: logs_context = LoggingContext(log_model=step_run.logs) - else: - logger.debug( - "There is no LogsResponseModel prepared for the step. The" - "step logging storage is disabled." - ) with logs_context: try: diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index d40e5f7fb40..c72c283c105 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -150,14 +150,8 @@ def run( ) logs_context = nullcontext() - if step_logging_enabled: - if step_run.logs: - logs_context = LoggingContext(log_model=step_run.logs) - else: - logger.debug( - "There is no LogsResponseModel prepared for the step. The" - "step logging storage is disabled." 
- ) + if step_logging_enabled and step_run.logs: + logs_context = LoggingContext(log_model=step_run.logs) with logs_context: step_instance = self._load_step() diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 86e9111b982..58ef29f56d5 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -64,7 +64,7 @@ ) from zenml.hooks.hook_validators import resolve_and_validate_hook from zenml.logger import get_logger -from zenml.logging.logging import LoggingContext +from zenml.logging.logging import LoggingContext, generate_logs_request from zenml.models import ( CodeReferenceRequest, DeploymentResponse, @@ -1043,15 +1043,11 @@ def _run( else True, ) - logs_context = nullcontext() - logs_request = None - snapshot = self._create_snapshot(**self._run_args) self.log_pipeline_snapshot_metadata(snapshot) + logs_request = None if logging_enabled: - from zenml.logging.logging import generate_logs_request - logs_request = generate_logs_request(source="client") run = ( @@ -1060,8 +1056,10 @@ def _run( else None ) + logs_context = nullcontext() if logging_enabled and run and run.logs: logs_context = LoggingContext(log_model=run.logs) + with logs_context: analytics_handler.metadata = ( self._get_pipeline_analytics_metadata( From d2234733abc61a8f987ed27f7289c13e85dd694d Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 18 Nov 2025 18:24:12 +0100 Subject: [PATCH 27/81] new defaults and formatting --- .../orchestrators/kubernetes_orchestrator_entrypoint.py | 3 ++- src/zenml/log_stores/otel/otel_flavor.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index 5ca097e3c65..9de9e4ba9ea 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -282,7 +282,8 @@ def main() -> None: ] logs_context = setup_orchestrator_logging( - pipeline_run=pipeline_run, snapshot=snapshot, + pipeline_run=pipeline_run, + snapshot=snapshot, ) with logs_context: step_command = StepEntrypointConfiguration.get_entrypoint_command() diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index 75887cbcf36..62912a42a4d 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -42,11 +42,11 @@ class OtelLogStoreConfig(BaseLogStoreConfig): description="Name of the service for telemetry", ) max_queue_size: int = Field( - default=2048, + default=8096, description="Maximum queue size for batch log processor", ) schedule_delay_millis: int = Field( - default=1000, + default=15000, description="Export interval in milliseconds", ) max_export_batch_size: int = Field( From 6798bf8292d57c774d029003984c7a5060e619e8 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 19 Nov 2025 15:09:41 +0100 Subject: [PATCH 28/81] moving stuff around --- .../execution/pipeline/dynamic/runner.py | 4 +- .../kubernetes_orchestrator_entrypoint.py | 2 +- src/zenml/log_stores/__init__.py | 16 ++ .../artifact/artifact_log_exporter.py | 2 +- src/zenml/log_stores/base_log_store.py | 3 +- src/zenml/log_stores/otel/otel_log_store.py | 3 +- src/zenml/log_stores/utils.py | 211 +++++++++++++++- src/zenml/logger.py | 4 +- src/zenml/logging/__init__.py | 28 
--- src/zenml/logging/logging.py | 232 ------------------ src/zenml/orchestrators/step_launcher.py | 8 +- src/zenml/orchestrators/step_runner.py | 2 +- src/zenml/pipelines/pipeline_definition.py | 2 +- .../zen_server/routers/runs_endpoints.py | 6 +- 14 files changed, 242 insertions(+), 281 deletions(-) delete mode 100644 src/zenml/logging/__init__.py delete mode 100644 src/zenml/logging/logging.py diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 4f41daab20d..e73862a3cf6 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -48,10 +48,8 @@ DynamicPipelineRunContext, ) from zenml.execution.step.utils import launch_step +from zenml.log_stores.utils import setup_orchestrator_logging from zenml.logger import get_logger -from zenml.logging.logging import ( - setup_orchestrator_logging, -) from zenml.models import ( ArtifactVersionResponse, PipelineRunResponse, diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index 9de9e4ba9ea..2eaa9176f9e 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -55,8 +55,8 @@ from zenml.integrations.kubernetes.orchestrators.kubernetes_orchestrator import ( KubernetesOrchestrator, ) +from zenml.log_stores.utils import setup_orchestrator_logging from zenml.logger import get_logger -from zenml.logging.logging import setup_orchestrator_logging from zenml.models import ( PipelineRunResponse, PipelineSnapshotResponse, diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index f288ef5d05a..a60498b8c46 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -45,6 +45,16 @@ DatadogLogStoreFlavor, ) +# Logging utilities +from zenml.log_stores.utils import ( + LogEntry, + LoggingContext, + generate_logs_request, + is_logging_enabled, + search_logs_by_source, + setup_orchestrator_logging, +) + __all__ = [ "ArtifactLogStore", "ArtifactLogStoreConfig", @@ -55,7 +65,13 @@ "DatadogLogStore", "DatadogLogStoreConfig", "DatadogLogStoreFlavor", + "LogEntry", + "LoggingContext", "OtelLogStore", "OtelLogStoreConfig", "OtelLogStoreFlavor", + "generate_logs_request", + "is_logging_enabled", + "search_logs_by_source", + "setup_orchestrator_logging", ] diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index b442c5a8d8d..03726f01f88 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -25,7 +25,7 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs import LogData - from zenml.logging.logging import LoggingContext + from zenml.log_stores.utils import LoggingContext from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 8940f7ae3b8..fb97d3ae8a0 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -19,8 +19,7 @@ from typing import List, Optional, Type, cast from zenml.enums import StackComponentType -from zenml.log_stores.utils import LogEntry -from zenml.logging.logging import LoggingContext +from 
zenml.log_stores.utils import LogEntry, LoggingContext from zenml.models import LogsResponse from zenml.stack import Flavor, StackComponent, StackComponentConfig diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 07c9693c39c..8ebc4605210 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -34,8 +34,7 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs.export import LogExporter - from zenml.log_stores.utils import LogEntry - from zenml.logging.logging import LoggingContext + from zenml.log_stores.utils import LogEntry, LoggingContext logger = get_logger(__name__) diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index 45c85358a36..cb43750786d 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -13,13 +13,222 @@ # permissions and limitations under the License. """Utilities for log stores.""" +import logging +import threading +from contextlib import nullcontext +from contextvars import ContextVar from datetime import datetime -from typing import Optional +from types import TracebackType +from typing import Any, List, Optional, Type from uuid import UUID, uuid4 from pydantic import BaseModel, Field +from zenml.client import Client +from zenml.config.pipeline_configurations import PipelineConfiguration +from zenml.constants import ( + ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, + handle_bool_env_var, +) from zenml.enums import LoggingLevels +from zenml.logger import get_logger +from zenml.models import ( + LogsRequest, + LogsResponse, + PipelineRunResponse, + PipelineRunUpdate, + PipelineSnapshotResponse, +) + +logger = get_logger(__name__) + +# Active logging context +active_logging_context: ContextVar[Optional["LoggingContext"]] = ContextVar( + "active_logging_context", default=None +) + + +def generate_logs_request(source: str) -> LogsRequest: + """Generate a LogsRequest for logging. + + Args: + source: The source of the logs (e.g., "client", "orchestrator", "step"). + + Returns: + A LogsRequest object. + """ + from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, + prepare_logs_uri, + ) + + client = Client() + log_store = client.active_stack.log_store + log_id = uuid4() + + if isinstance(log_store, ArtifactLogStore): + artifact_store = client.active_stack.artifact_store + return LogsRequest( + id=log_id, + source=source, + uri=prepare_logs_uri( + artifact_store=artifact_store, + log_id=log_id, + ), + artifact_store_id=artifact_store.id, + ) + else: + return LogsRequest( + id=log_id, + source=source, + log_store_id=log_store.id if log_store else None, + ) + + +class LoggingContext: + """Context manager which collects logs using a LogStore.""" + + def __init__( + self, + log_model: LogsResponse, + ) -> None: + """Initialize the logging context. + + Args: + log_model: The logs response model for this context. + """ + self.log_model = log_model + self._lock = threading.Lock() + self._previous_context: Optional[LoggingContext] = None + + @classmethod + def emit(cls, record: logging.LogRecord) -> None: + """Emit a log record using the active logging context. + + This class method is called by stdout/stderr wrappers and logging + handlers to route logs to the active log store. + + Args: + record: The log record to emit. 
+ """ + try: + if context := active_logging_context.get(): + Client().active_stack.log_store.emit(record, context) + except Exception: + pass + + def __enter__(self) -> "LoggingContext": + """Enter the context and set as active. + + Returns: + self + """ + with self._lock: + self._previous_context = active_logging_context.get() + active_logging_context.set(self) + + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: + """Exit the context and restore previous context. + + Args: + exc_type: The class of the exception. + exc_val: The instance of the exception. + exc_tb: The traceback of the exception. + """ + if exc_type is not None: + logger.error( + "An exception has occurred.", + exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, + ) + + with self._lock: + active_logging_context.set(self._previous_context) + + +def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: + """Check if logging is enabled for a pipeline configuration. + + Args: + pipeline_configuration: The pipeline configuration. + + Returns: + True if logging is enabled, False if disabled. + """ + if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): + return False + elif pipeline_configuration.enable_pipeline_logs is not None: + return pipeline_configuration.enable_pipeline_logs + else: + return True + + +def search_logs_by_source( + logs_collection: List[LogsResponse], source: str +) -> Optional[LogsResponse]: + """Get the logs response for a given source. + + Args: + logs_collection: The logs collection. + source: The source of the logs. + + Returns: + The logs response for the given source. + """ + for log in logs_collection: + if log.source == source: + return log + return None + + +def setup_orchestrator_logging( + pipeline_run: "PipelineRunResponse", + snapshot: "PipelineSnapshotResponse", +) -> Any: + """Set up logging for an orchestrator environment. + + This function can be reused by different orchestrators to set up + consistent logging behavior. + + Args: + pipeline_run: The pipeline run. + snapshot: The snapshot of the pipeline run. + + Returns: + The logs context or nullcontext if logging is disabled. 
+ """ + logging_enabled = is_logging_enabled(snapshot.pipeline_configuration) + + if not logging_enabled: + return nullcontext() + + if orchestrator_logs := search_logs_by_source( + pipeline_run.log_collection, "orchestrator" + ): + return LoggingContext(log_model=orchestrator_logs) + + logs_request = generate_logs_request(source="orchestrator") + try: + client = Client() + run_update = PipelineRunUpdate(add_logs=[logs_request]) + pipeline_run = client.zen_store.update_run( + run_id=pipeline_run.id, run_update=run_update + ) + except Exception as e: + logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") + + if orchestrator_logs := search_logs_by_source( + pipeline_run.log_collection, "orchestrator" + ): + return LoggingContext(log_model=orchestrator_logs) + + return nullcontext() class LogEntry(BaseModel): diff --git a/src/zenml/logger.py b/src/zenml/logger.py index 392ca52420d..92ccd146c24 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -225,7 +225,7 @@ def _wrapped_write(original_write: Any, stream_name: str) -> Any: def wrapped_write(text: str) -> int: """Write method that routes logs through LoggingContext.""" - from zenml.logging.logging import LoggingContext + from zenml.log_stores.utils import LoggingContext level_int = logging.INFO if stream_name == "stdout" else logging.ERROR @@ -273,7 +273,7 @@ def emit(self, record: logging.LogRecord) -> None: Args: record: The log record to emit. """ - from zenml.logging.logging import LoggingContext + from zenml.log_stores.utils import LoggingContext LoggingContext.emit(record) diff --git a/src/zenml/logging/__init__.py b/src/zenml/logging/__init__.py deleted file mode 100644 index b38f8acd12a..00000000000 --- a/src/zenml/logging/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) ZenML GmbH 2023. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. -"""ZenML logging module.""" - -from zenml.logging.logging import ( - LoggingContext, - generate_logs_request, - setup_orchestrator_logging, - setup_pipeline_logging, -) - -__all__ = [ - "LoggingContext", - "generate_logs_request", - "setup_orchestrator_logging", - "setup_pipeline_logging", -] diff --git a/src/zenml/logging/logging.py b/src/zenml/logging/logging.py deleted file mode 100644 index 6d31885a81e..00000000000 --- a/src/zenml/logging/logging.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) ZenML GmbH 2023. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. 
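The module deleted here has moved to zenml.log_stores.utils; the capture flow it provides is unchanged and can be sketched as follows (step_run is assumed to be a StepRunResponse with populated logs):

from zenml.log_stores.utils import LoggingContext
from zenml.logger import get_logger

logger = get_logger(__name__)

with LoggingContext(log_model=step_run.logs):
    print("captured via the wrapped stdout write()")  # -> LoggingContext.emit
    logger.info("captured via the logging handler")   # -> LoggingContext.emit
# Outside the context, output only reaches the console handler.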
-"""ZenML logging.""" - -import logging -import threading -from contextlib import nullcontext -from contextvars import ContextVar -from types import TracebackType -from typing import ( - Any, - List, - Optional, - Type, -) -from uuid import uuid4 - -from zenml.client import Client -from zenml.config.pipeline_configurations import PipelineConfiguration -from zenml.constants import ( - ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, - handle_bool_env_var, -) -from zenml.logger import get_logger -from zenml.models import ( - LogsRequest, - LogsResponse, - PipelineRunResponse, - PipelineRunUpdate, - PipelineSnapshotResponse, -) - -logger = get_logger(__name__) - -# Active logging context -active_logging_context: ContextVar[Optional["LoggingContext"]] = ContextVar( - "active_logging_context", default=None -) - - -def generate_logs_request(source: str) -> LogsRequest: - """Generate a LogsRequest for logging. - - Args: - source: The source of the logs (e.g., "client", "orchestrator", "step"). - - Returns: - A LogsRequest object. - """ - from zenml.log_stores.artifact.artifact_log_store import ( - ArtifactLogStore, - prepare_logs_uri, - ) - - client = Client() - log_store = client.active_stack.log_store - log_id = uuid4() - - if isinstance(log_store, ArtifactLogStore): - artifact_store = client.active_stack.artifact_store - return LogsRequest( - id=log_id, - source=source, - uri=prepare_logs_uri( - artifact_store=artifact_store, - log_id=log_id, - ), - artifact_store_id=artifact_store.id, - ) - else: - return LogsRequest( - id=log_id, - source=source, - log_store_id=log_store.id if log_store else None, - ) - - -class LoggingContext: - """Context manager which collects logs using a LogStore.""" - - def __init__( - self, - log_model: LogsResponse, - ) -> None: - """Initialize the logging context. - - Args: - log_model: The logs response model for this context. - """ - self.log_model = log_model - self._lock = threading.Lock() - self._previous_context: Optional[LoggingContext] = None - - @classmethod - def emit(cls, record: logging.LogRecord) -> None: - """Emit a log record using the active logging context. - - This class method is called by stdout/stderr wrappers and logging - handlers to route logs to the active log store. - - Args: - record: The log record to emit. - """ - try: - if context := active_logging_context.get(): - Client().active_stack.log_store.emit(record, context) - except Exception: - pass - - def __enter__(self) -> "LoggingContext": - """Enter the context and set as active. - - Returns: - self - """ - with self._lock: - self._previous_context = active_logging_context.get() - active_logging_context.set(self) - - return self - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> None: - """Exit the context and restore previous context. - - Args: - exc_type: The class of the exception. - exc_val: The instance of the exception. - exc_tb: The traceback of the exception. - """ - if exc_type is not None: - logger.error( - "An exception has occurred.", - exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, - ) - - with self._lock: - active_logging_context.set(self._previous_context) - - -def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: - """Check if logging is enabled for a pipeline configuration. - - Args: - pipeline_configuration: The pipeline configuration. - - Returns: - True if logging is enabled, False if disabled. 
- """ - if handle_bool_env_var(ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False): - return False - elif pipeline_configuration.enable_pipeline_logs is not None: - return pipeline_configuration.enable_pipeline_logs - else: - return True - - -def search_logs_by_source( - logs_collection: List[LogsResponse], source: str -) -> Optional[LogsResponse]: - """Get the logs response for a given source. - - Args: - logs_collection: The logs collection. - source: The source of the logs. - - Returns: - The logs response for the given source. - """ - for log in logs_collection: - if log.source == source: - return log - return None - - -def setup_orchestrator_logging( - pipeline_run: "PipelineRunResponse", - snapshot: "PipelineSnapshotResponse", -) -> Any: - """Set up logging for an orchestrator environment. - - This function can be reused by different orchestrators to set up - consistent logging behavior. - - Args: - pipeline_run: The pipeline run. - snapshot: The snapshot of the pipeline run. - - Returns: - The logs context or nullcontext if logging is disabled. - """ - logging_enabled = is_logging_enabled(snapshot.pipeline_configuration) - - if not logging_enabled: - return nullcontext() - - if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, "orchestrator" - ): - return LoggingContext(log_model=orchestrator_logs) - - logs_request = generate_logs_request(source="orchestrator") - try: - client = Client() - run_update = PipelineRunUpdate(add_logs=[logs_request]) - pipeline_run = client.zen_store.update_run( - run_id=pipeline_run.id, run_update=run_update - ) - except Exception as e: - logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") - - if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, "orchestrator" - ): - return LoggingContext(log_model=orchestrator_logs) - - return nullcontext() diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 720c7f61851..fd6c932af3b 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -29,9 +29,11 @@ from zenml.enums import ExecutionMode, ExecutionStatus, StepRuntime from zenml.environment import get_run_environment_dict from zenml.exceptions import RunInterruptedException, RunStoppedException +from zenml.log_stores.utils import ( + LoggingContext, + generate_logs_request, +) from zenml.logger import get_logger -from zenml.logging import LoggingContext -from zenml.logging import logging as zenml_logging from zenml.models import ( PipelineRunRequest, PipelineRunResponse, @@ -273,7 +275,7 @@ def launch(self) -> StepRunResponse: logs_request = None if step_logging_enabled: - logs_request = zenml_logging.generate_logs_request(source="step") + logs_request = generate_logs_request(source="step") if run_was_created: pipeline_run_metadata = self._stack.get_pipeline_run_metadata( diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index c72c283c105..9185cf04f75 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -40,8 +40,8 @@ from zenml.enums import ArtifactSaveType from zenml.exceptions import StepInterfaceError from zenml.hooks.hook_validators import load_and_run_hook +from zenml.log_stores.utils import LoggingContext from zenml.logger import get_logger -from zenml.logging.logging import LoggingContext from zenml.materializers.base_materializer import BaseMaterializer from zenml.materializers.in_memory_materializer import 
InMemoryMaterializer from zenml.models.v2.core.step_run import ( diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 58ef29f56d5..4ba735d2f9a 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -63,8 +63,8 @@ submit_pipeline, ) from zenml.hooks.hook_validators import resolve_and_validate_hook +from zenml.log_stores.utils import LoggingContext, generate_logs_request from zenml.logger import get_logger -from zenml.logging.logging import LoggingContext, generate_logs_request from zenml.models import ( CodeReferenceRequest, DeploymentResponse, diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index 3182ef29f29..33c2c099b6e 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -31,11 +31,9 @@ ) from zenml.enums import ExecutionStatus from zenml.log_stores import fetch_logs +from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST +from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger -from zenml.logging.logging import ( - MAX_ENTRIES_PER_REQUEST, - LogEntry, -) from zenml.models import ( Page, PipelineRunDAG, From 02a724aa37833e4abd40f26fa0df1119ce78a90c Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 19 Nov 2025 15:09:49 +0100 Subject: [PATCH 29/81] adding the dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4f65296e7c5..2eda15f5503 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "docker~=7.1.0", "gitpython>=3.1.18,<4.0.0", "jsonref", + "opentelemetry-sdk>=1.0,<=1.38.0", "packaging>=24.1", "psutil>=5.0.0", "pydantic>=2.0,<=2.11.9", From ddd54bb4d3da9a6919cfd7301590fbfc00563511 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 11:01:13 +0100 Subject: [PATCH 30/81] fixing the migration --- .../migrations/versions/5c0a1c787128_add_log_stores.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index 042bcfae561..3e2585583ed 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -1,7 +1,7 @@ """add log stores [5c0a1c787128]. Revision ID: 5c0a1c787128 -Revises: 4dd9d3afd2c0 +Revises: 0.91.2 Create Date: 2025-10-24 10:06:54.402219 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. 
revision = "5c0a1c787128" -down_revision = "4dd9d3afd2c0" +down_revision = "0.91.2" branch_labels = None depends_on = None From 854e96a1985a1bfb73a8b74cfbf5dfdbc2f1bf2a Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 17:01:02 +0100 Subject: [PATCH 31/81] new changes --- src/zenml/log_stores/__init__.py | 31 ++++++++--------- src/zenml/log_stores/utils.py | 57 +++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index a60498b8c46..d4206423299 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -14,12 +14,6 @@ """Implements the log stores for ZenML.""" # Base log store -from zenml.log_stores.base_log_store import ( - BaseLogStore, - BaseLogStoreConfig, - BaseLogStoreFlavor, -) - # Artifact log store from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, @@ -28,27 +22,33 @@ ArtifactLogStoreConfig, ArtifactLogStoreFlavor, ) - -# OpenTelemetry log store -from zenml.log_stores.otel.otel_log_store import OtelLogStore -from zenml.log_stores.otel.otel_flavor import ( - OtelLogStoreConfig, - OtelLogStoreFlavor, +from zenml.log_stores.base_log_store import ( + BaseLogStore, + BaseLogStoreConfig, + BaseLogStoreFlavor, +) +from zenml.log_stores.datadog.datadog_flavor import ( + DatadogLogStoreConfig, + DatadogLogStoreFlavor, ) # Datadog log store from zenml.log_stores.datadog.datadog_log_store import ( DatadogLogStore, ) -from zenml.log_stores.datadog.datadog_flavor import ( - DatadogLogStoreConfig, - DatadogLogStoreFlavor, +from zenml.log_stores.otel.otel_flavor import ( + OtelLogStoreConfig, + OtelLogStoreFlavor, ) +# OpenTelemetry log store +from zenml.log_stores.otel.otel_log_store import OtelLogStore + # Logging utilities from zenml.log_stores.utils import ( LogEntry, LoggingContext, + fetch_logs, generate_logs_request, is_logging_enabled, search_logs_by_source, @@ -70,6 +70,7 @@ "OtelLogStore", "OtelLogStoreConfig", "OtelLogStoreFlavor", + "fetch_logs", "generate_logs_request", "is_logging_enabled", "search_logs_by_source", diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index cb43750786d..d5bac851c33 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -19,9 +19,11 @@ from contextvars import ContextVar from datetime import datetime from types import TracebackType -from typing import Any, List, Optional, Type +from typing import TYPE_CHECKING, Any, List, Optional, Type from uuid import UUID, uuid4 +from zenml.enums import StackComponentType +from zenml.log_stores.base_log_store import BaseLogStore from pydantic import BaseModel, Field from zenml.client import Client @@ -39,6 +41,10 @@ PipelineRunUpdate, PipelineSnapshotResponse, ) +from zenml.utils.time_utils import utc_now + +if TYPE_CHECKING: + from zenml.zen_stores.base_zen_store import BaseZenStore logger = get_logger(__name__) @@ -231,6 +237,55 @@ def setup_orchestrator_logging( return nullcontext() +def fetch_logs( + logs: LogsResponse, + zen_store: "BaseZenStore", + limit: int, +) -> List["LogEntry"]: + """Fetch logs from the log store. + + This function is designed to be called from the server side where we can't + always instantiate the full Stack object due to missing integration dependencies. + Instead, it directly instantiates the appropriate log store based on the logs model. + + Args: + logs: The logs response model containing metadata about the logs. + zen_store: The zen store instance. 
+ limit: Maximum number of log entries to return. + + Returns: + List of log entries. + """ + log_store: Optional[BaseLogStore] = None + + if logs.log_store_id: + stack_component_response = zen_store.get_stack_component( + logs.log_store_id + ) + log_store = BaseLogStore.from_model(stack_component_response) + else: + from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, + ) + from zenml.log_stores.artifact.artifact_log_store_flavor import ( + ArtifactLogStoreConfig, + ) + + current_time = utc_now() + log_store = ArtifactLogStore( + name="default_artifact_log_store", + id=uuid4(), + config=ArtifactLogStoreConfig(), + flavor="artifact", + type=StackComponentType.LOG_STORE, + user=uuid4(), + created=current_time, + updated=current_time, + ) + + return log_store.fetch(logs_model=logs, limit=limit) + + class LogEntry(BaseModel): """A structured log entry with parsed information.""" From 33ea2f2e85a1ad97384c85381172c7a9463c8bdb Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 17:01:19 +0100 Subject: [PATCH 32/81] formating --- src/zenml/log_stores/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index d5bac851c33..193ede3d989 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -22,8 +22,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Type from uuid import UUID, uuid4 -from zenml.enums import StackComponentType -from zenml.log_stores.base_log_store import BaseLogStore from pydantic import BaseModel, Field from zenml.client import Client @@ -32,7 +30,8 @@ ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, handle_bool_env_var, ) -from zenml.enums import LoggingLevels +from zenml.enums import LoggingLevels, StackComponentType +from zenml.log_stores.base_log_store import BaseLogStore from zenml.logger import get_logger from zenml.models import ( LogsRequest, From 0a64385eb2f2da6a5e46ee03449b1696ada4d3a0 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 17:09:15 +0100 Subject: [PATCH 33/81] new creation --- src/zenml/log_stores/utils.py | 41 ++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/zenml/log_stores/utils.py b/src/zenml/log_stores/utils.py index 193ede3d989..71315178ba9 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/log_stores/utils.py @@ -30,8 +30,7 @@ ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, handle_bool_env_var, ) -from zenml.enums import LoggingLevels, StackComponentType -from zenml.log_stores.base_log_store import BaseLogStore +from zenml.enums import LoggingLevels from zenml.logger import get_logger from zenml.models import ( LogsRequest, @@ -254,14 +253,44 @@ def fetch_logs( Returns: List of log entries. + + Raises: + DoesNotExistException: If the log store doesn't exist or is not the right type. + NotImplementedError: If the log store's dependencies are not installed. 
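A sketch of how the server side consumes this helper; run is an already-fetched PipelineRunResponse, and the limit is illustrative:

from zenml.log_stores import fetch_logs
from zenml.zen_server.utils import zen_store  # accessor used by the REST endpoints

if run.logs:
    entries = fetch_logs(logs=run.logs, zen_store=zen_store(), limit=1000)
    print(f"fetched {len(entries)} log entries")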
""" + from typing import cast + + from zenml.enums import StackComponentType + from zenml.exceptions import DoesNotExistException + from zenml.log_stores.base_log_store import BaseLogStore + from zenml.stack import StackComponent + log_store: Optional[BaseLogStore] = None if logs.log_store_id: - stack_component_response = zen_store.get_stack_component( - logs.log_store_id - ) - log_store = BaseLogStore.from_model(stack_component_response) + try: + log_store_model = zen_store.get_stack_component(logs.log_store_id) + except KeyError: + raise DoesNotExistException( + f"Log store '{logs.log_store_id}' does not exist." + ) + + if not log_store_model.type == StackComponentType.LOG_STORE: + raise DoesNotExistException( + f"Stack component '{logs.log_store_id}' is not a log store." + ) + + try: + log_store = cast( + BaseLogStore, + StackComponent.from_model(log_store_model), + ) + except ImportError: + raise NotImplementedError( + f"Log store '{log_store_model.name}' could not be " + f"instantiated. This is likely because the log store's " + f"dependencies are not installed." + ) else: from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, From 6a2d9d685ccb4c6cdd86f6f5c7563f42245801f3 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 17:41:28 +0100 Subject: [PATCH 34/81] some new import structure --- .../execution/pipeline/dynamic/runner.py | 2 +- .../kubernetes_orchestrator_entrypoint.py | 2 +- src/zenml/log_stores/__init__.py | 2 +- .../artifact/artifact_log_exporter.py | 4 +- .../log_stores/artifact/artifact_log_store.py | 2 +- src/zenml/log_stores/base_log_store.py | 2 +- .../log_stores/datadog/datadog_log_store.py | 2 +- src/zenml/log_stores/otel/otel_log_store.py | 2 +- src/zenml/logger.py | 4 +- src/zenml/orchestrators/step_launcher.py | 8 +- src/zenml/orchestrators/step_runner.py | 2 +- src/zenml/pipelines/pipeline_definition.py | 2 +- .../utils.py => utils/logging_utils.py} | 169 +++++++++--------- .../zen_server/routers/runs_endpoints.py | 3 +- .../zen_server/routers/steps_endpoints.py | 3 +- 15 files changed, 105 insertions(+), 104 deletions(-) rename src/zenml/{log_stores/utils.py => utils/logging_utils.py} (94%) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 5d5751afda8..fbb0b69e7ef 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -48,7 +48,6 @@ DynamicPipelineRunContext, ) from zenml.execution.step.utils import launch_step -from zenml.log_stores.utils import setup_orchestrator_logging from zenml.logger import get_logger from zenml.models import ( ArtifactVersionResponse, @@ -65,6 +64,7 @@ from zenml.steps.entrypoint_function_utils import StepArtifact from zenml.steps.utils import OutputSignature from zenml.utils import source_utils +from zenml.utils.logging_utils import setup_orchestrator_logging if TYPE_CHECKING: from zenml.config import DockerSettings diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index 67d139fb2b4..ef793db024a 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -55,7 +55,6 @@ from zenml.integrations.kubernetes.orchestrators.kubernetes_orchestrator import ( KubernetesOrchestrator, ) -from zenml.log_stores.utils 
import setup_orchestrator_logging from zenml.logger import get_logger from zenml.models import ( PipelineRunResponse, @@ -72,6 +71,7 @@ ) from zenml.pipelines.run_utils import create_placeholder_run from zenml.utils import env_utils +from zenml.utils.logging_utils import setup_orchestrator_logging logger = get_logger(__name__) diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index d4206423299..52bf07e7248 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -45,7 +45,7 @@ from zenml.log_stores.otel.otel_log_store import OtelLogStore # Logging utilities -from zenml.log_stores.utils import ( +from zenml.utils.logging_utils import ( LogEntry, LoggingContext, fetch_logs, diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 03726f01f88..639e0d2089a 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -25,7 +25,7 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs import LogData - from zenml.log_stores.utils import LoggingContext + from zenml.utils.logging_utils import LoggingContext from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client @@ -35,8 +35,8 @@ ) from zenml.log_stores.base_log_store import DEFAULT_MESSAGE_SIZE from zenml.log_stores.otel.otel_log_store import LOGGING_CONTEXT_KEY -from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger +from zenml.utils.logging_utils import LogEntry from zenml.utils.time_utils import utc_now logger = get_logger(__name__) diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 261b2ffa0d3..4abd7ae68ff 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -37,10 +37,10 @@ ) from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.otel.otel_log_store import OtelLogStore -from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path +from zenml.utils.logging_utils import LogEntry from zenml.zen_stores.base_zen_store import BaseZenStore ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index fb97d3ae8a0..025c53ef155 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -19,9 +19,9 @@ from typing import List, Optional, Type, cast from zenml.enums import StackComponentType -from zenml.log_stores.utils import LogEntry, LoggingContext from zenml.models import LogsResponse from zenml.stack import Flavor, StackComponent, StackComponentConfig +from zenml.utils.logging_utils import LogEntry, LoggingContext # Maximum number of log entries to return in a single request MAX_ENTRIES_PER_REQUEST = 20000 diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index cb442b3cd6d..9b8f8c7da9b 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -22,9 +22,9 @@ from zenml.enums import LoggingLevels from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore -from 
zenml.log_stores.utils import LogEntry from zenml.logger import get_logger from zenml.models import LogsResponse +from zenml.utils.logging_utils import LogEntry logger = get_logger(__name__) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 8ebc4605210..fdb8153c03a 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -34,7 +34,7 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs.export import LogExporter - from zenml.log_stores.utils import LogEntry, LoggingContext + from zenml.utils.logging_utils import LogEntry, LoggingContext logger = get_logger(__name__) diff --git a/src/zenml/logger.py b/src/zenml/logger.py index 92ccd146c24..0a732ee3c2f 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -225,7 +225,7 @@ def _wrapped_write(original_write: Any, stream_name: str) -> Any: def wrapped_write(text: str) -> int: """Write method that routes logs through LoggingContext.""" - from zenml.log_stores.utils import LoggingContext + from zenml.utils.logging_utils import LoggingContext level_int = logging.INFO if stream_name == "stdout" else logging.ERROR @@ -273,7 +273,7 @@ def emit(self, record: logging.LogRecord) -> None: Args: record: The log record to emit. """ - from zenml.log_stores.utils import LoggingContext + from zenml.utils.logging_utils import LoggingContext LoggingContext.emit(record) diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index a65be149027..f8c3399d707 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -29,10 +29,6 @@ from zenml.enums import ExecutionMode, ExecutionStatus, StepRuntime from zenml.environment import get_run_environment_dict from zenml.exceptions import RunInterruptedException, RunStoppedException -from zenml.log_stores.utils import ( - LoggingContext, - generate_logs_request, -) from zenml.logger import get_logger from zenml.models import ( PipelineRunRequest, @@ -47,6 +43,10 @@ from zenml.stack import Stack from zenml.steps import StepHeartBeatTerminationException, StepHeartbeatWorker from zenml.utils import env_utils, exception_utils, string_utils +from zenml.utils.logging_utils import ( + LoggingContext, + generate_logs_request, +) from zenml.utils.time_utils import utc_now if TYPE_CHECKING: diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index 9185cf04f75..5b576671c80 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -40,7 +40,6 @@ from zenml.enums import ArtifactSaveType from zenml.exceptions import StepInterfaceError from zenml.hooks.hook_validators import load_and_run_hook -from zenml.log_stores.utils import LoggingContext from zenml.logger import get_logger from zenml.materializers.base_materializer import BaseMaterializer from zenml.materializers.in_memory_materializer import InMemoryMaterializer @@ -73,6 +72,7 @@ string_utils, tag_utils, ) +from zenml.utils.logging_utils import LoggingContext from zenml.utils.typing_utils import get_origin, is_union if TYPE_CHECKING: diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index 4ba735d2f9a..59e287c642b 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -63,7 +63,6 @@ submit_pipeline, ) from zenml.hooks.hook_validators import resolve_and_validate_hook -from zenml.log_stores.utils import 
LoggingContext, generate_logs_request from zenml.logger import get_logger from zenml.models import ( CodeReferenceRequest, @@ -101,6 +100,7 @@ source_utils, yaml_utils, ) +from zenml.utils.logging_utils import LoggingContext, generate_logs_request from zenml.utils.string_utils import format_name_template from zenml.utils.tag_utils import Tag diff --git a/src/zenml/log_stores/utils.py b/src/zenml/utils/logging_utils.py similarity index 94% rename from src/zenml/log_stores/utils.py rename to src/zenml/utils/logging_utils.py index 71315178ba9..8765139ef1a 100644 --- a/src/zenml/log_stores/utils.py +++ b/src/zenml/utils/logging_utils.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. -"""Utilities for log stores.""" +"""Utility functions for logging.""" import logging import threading @@ -19,7 +19,7 @@ from contextvars import ContextVar from datetime import datetime from types import TracebackType -from typing import TYPE_CHECKING, Any, List, Optional, Type +from typing import TYPE_CHECKING, Any, List, Optional, Type, cast from uuid import UUID, uuid4 from pydantic import BaseModel, Field @@ -30,7 +30,8 @@ ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, handle_bool_env_var, ) -from zenml.enums import LoggingLevels +from zenml.enums import LoggingLevels, StackComponentType +from zenml.exceptions import DoesNotExistException from zenml.logger import get_logger from zenml.models import ( LogsRequest, @@ -39,6 +40,7 @@ PipelineRunUpdate, PipelineSnapshotResponse, ) +from zenml.stack import StackComponent from zenml.utils.time_utils import utc_now if TYPE_CHECKING: @@ -46,47 +48,56 @@ logger = get_logger(__name__) -# Active logging context + active_logging_context: ContextVar[Optional["LoggingContext"]] = ContextVar( "active_logging_context", default=None ) -def generate_logs_request(source: str) -> LogsRequest: - """Generate a LogsRequest for logging. - - Args: - source: The source of the logs (e.g., "client", "orchestrator", "step"). +class LogEntry(BaseModel): + """A structured log entry with parsed information. - Returns: - A LogsRequest object. + This is used in two distinct ways: + 1. If we are using the artifact log store, we save the + entries as JSON-serialized LogEntry's in the artifact store. + 2. When queried, the server returns logs as a list of LogEntry's. 
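The two uses can be illustrated with a short round trip; the field values below are made up, only the field names come from the model, and `zenml.utils.logging_utils` is the module path established by this rename:

    import json

    from zenml.utils.logging_utils import LogEntry

    # 1. As persisted by the artifact log store: one JSON object per line (.jsonl).
    line = json.dumps(
        {
            "message": "Step `trainer` finished.",
            "name": "zenml.orchestrators.step_launcher",
            "level": 20,  # LoggingLevels.INFO
            "timestamp": "2025-11-20T16:59:12+00:00",
        }
    )

    # 2. As returned to callers: parsed back into structured LogEntry objects.
    entry = LogEntry.model_validate_json(line)
    print(entry.level, entry.message)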
""" - from zenml.log_stores.artifact.artifact_log_store import ( - ArtifactLogStore, - prepare_logs_uri, - ) - client = Client() - log_store = client.active_stack.log_store - log_id = uuid4() - - if isinstance(log_store, ArtifactLogStore): - artifact_store = client.active_stack.artifact_store - return LogsRequest( - id=log_id, - source=source, - uri=prepare_logs_uri( - artifact_store=artifact_store, - log_id=log_id, - ), - artifact_store_id=artifact_store.id, - ) - else: - return LogsRequest( - id=log_id, - source=source, - log_store_id=log_store.id if log_store else None, - ) + message: str = Field(description="The log message content") + name: Optional[str] = Field( + default=None, + description="The name of the logger", + ) + level: Optional[LoggingLevels] = Field( + default=None, + description="The log level", + ) + timestamp: Optional[datetime] = Field( + default=None, + description="When the log was created", + ) + module: Optional[str] = Field( + default=None, description="The module that generated this log entry" + ) + filename: Optional[str] = Field( + default=None, + description="The name of the file that generated this log entry", + ) + lineno: Optional[int] = Field( + default=None, description="The fileno that generated this log entry" + ) + chunk_index: int = Field( + default=0, + description="The index of the chunk in the log entry", + ) + total_chunks: int = Field( + default=1, + description="The total number of chunks in the log entry", + ) + id: UUID = Field( + default_factory=uuid4, + description="The unique identifier of the log entry", + ) class LoggingContext: @@ -94,7 +105,7 @@ class LoggingContext: def __init__( self, - log_model: LogsResponse, + log_model: "LogsResponse", ) -> None: """Initialize the logging context. @@ -156,6 +167,43 @@ def __exit__( active_logging_context.set(self._previous_context) +def generate_logs_request(source: str) -> LogsRequest: + """Generate a LogsRequest for logging. + + Args: + source: The source of the logs (e.g., "client", "orchestrator", "step"). + + Returns: + A LogsRequest object. + """ + from zenml.log_stores.artifact.artifact_log_store import ( + ArtifactLogStore, + prepare_logs_uri, + ) + + client = Client() + log_store = client.active_stack.log_store + log_id = uuid4() + + if isinstance(log_store, ArtifactLogStore): + artifact_store = client.active_stack.artifact_store + return LogsRequest( + id=log_id, + source=source, + uri=prepare_logs_uri( + artifact_store=artifact_store, + log_id=log_id, + ), + artifact_store_id=artifact_store.id, + ) + else: + return LogsRequest( + id=log_id, + source=source, + log_store_id=log_store.id if log_store else None, + ) + + def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: """Check if logging is enabled for a pipeline configuration. @@ -236,7 +284,7 @@ def setup_orchestrator_logging( def fetch_logs( - logs: LogsResponse, + logs: "LogsResponse", zen_store: "BaseZenStore", limit: int, ) -> List["LogEntry"]: @@ -258,12 +306,7 @@ def fetch_logs( DoesNotExistException: If the log store doesn't exist or is not the right type. NotImplementedError: If the log store's dependencies are not installed. 
""" - from typing import cast - - from zenml.enums import StackComponentType - from zenml.exceptions import DoesNotExistException from zenml.log_stores.base_log_store import BaseLogStore - from zenml.stack import StackComponent log_store: Optional[BaseLogStore] = None @@ -312,43 +355,3 @@ def fetch_logs( ) return log_store.fetch(logs_model=logs, limit=limit) - - -class LogEntry(BaseModel): - """A structured log entry with parsed information.""" - - message: str = Field(description="The log message content") - name: Optional[str] = Field( - default=None, - description="The name of the logger", - ) - level: Optional[LoggingLevels] = Field( - default=None, - description="The log level", - ) - timestamp: Optional[datetime] = Field( - default=None, - description="When the log was created", - ) - module: Optional[str] = Field( - default=None, description="The module that generated this log entry" - ) - filename: Optional[str] = Field( - default=None, - description="The name of the file that generated this log entry", - ) - lineno: Optional[int] = Field( - default=None, description="The fileno that generated this log entry" - ) - chunk_index: int = Field( - default=0, - description="The index of the chunk in the log entry", - ) - total_chunks: int = Field( - default=1, - description="The total number of chunks in the log entry", - ) - id: UUID = Field( - default_factory=uuid4, - description="The unique identifier of the log entry", - ) diff --git a/src/zenml/zen_server/routers/runs_endpoints.py b/src/zenml/zen_server/routers/runs_endpoints.py index 33c2c099b6e..053a325f0bf 100644 --- a/src/zenml/zen_server/routers/runs_endpoints.py +++ b/src/zenml/zen_server/routers/runs_endpoints.py @@ -30,9 +30,7 @@ VERSION_1, ) from zenml.enums import ExecutionStatus -from zenml.log_stores import fetch_logs from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST -from zenml.log_stores.utils import LogEntry from zenml.logger import get_logger from zenml.models import ( Page, @@ -45,6 +43,7 @@ StepRunResponse, ) from zenml.utils import run_utils +from zenml.utils.logging_utils import LogEntry, fetch_logs from zenml.zen_server.auth import ( AuthContext, authorize, diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 7e72e885e4c..db499a47ffc 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -29,9 +29,7 @@ ) from zenml.enums import ExecutionStatus from zenml.exceptions import AuthorizationException -from zenml.log_stores import fetch_logs from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST -from zenml.log_stores.utils import LogEntry from zenml.models import ( Page, StepRunFilter, @@ -40,6 +38,7 @@ StepRunUpdate, ) from zenml.models.v2.core.step_run import StepHeartbeatResponse +from zenml.utils.logging_utils import LogEntry, fetch_logs from zenml.zen_server.auth import ( AuthContext, authorize, From dba0a01e97940a4b700df3c9f9e38f20572406dc Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:02:50 +0100 Subject: [PATCH 35/81] fixing the order --- src/zenml/log_stores/__init__.py | 49 +++++++++++--------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index 52bf07e7248..024eae29d88 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -13,7 +13,20 @@ # permissions and limitations under the License. 
"""Implements the log stores for ZenML.""" -# Base log store +# Base classes +from zenml.log_stores.base_log_store import ( + BaseLogStore, + BaseLogStoreConfig, + BaseLogStoreFlavor, +) + +# OpenTelemetry log store +from zenml.log_stores.otel.otel_flavor import ( + OtelLogStoreConfig, + OtelLogStoreFlavor, +) +from zenml.log_stores.otel.otel_log_store import OtelLogStore + # Artifact log store from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, @@ -22,38 +35,15 @@ ArtifactLogStoreConfig, ArtifactLogStoreFlavor, ) -from zenml.log_stores.base_log_store import ( - BaseLogStore, - BaseLogStoreConfig, - BaseLogStoreFlavor, -) + +# Datadog log store from zenml.log_stores.datadog.datadog_flavor import ( DatadogLogStoreConfig, DatadogLogStoreFlavor, ) - -# Datadog log store from zenml.log_stores.datadog.datadog_log_store import ( DatadogLogStore, ) -from zenml.log_stores.otel.otel_flavor import ( - OtelLogStoreConfig, - OtelLogStoreFlavor, -) - -# OpenTelemetry log store -from zenml.log_stores.otel.otel_log_store import OtelLogStore - -# Logging utilities -from zenml.utils.logging_utils import ( - LogEntry, - LoggingContext, - fetch_logs, - generate_logs_request, - is_logging_enabled, - search_logs_by_source, - setup_orchestrator_logging, -) __all__ = [ "ArtifactLogStore", @@ -65,14 +55,7 @@ "DatadogLogStore", "DatadogLogStoreConfig", "DatadogLogStoreFlavor", - "LogEntry", - "LoggingContext", "OtelLogStore", "OtelLogStoreConfig", "OtelLogStoreFlavor", - "fetch_logs", - "generate_logs_request", - "is_logging_enabled", - "search_logs_by_source", - "setup_orchestrator_logging", ] From 017253aca1cde7ef6031aab037edfb1781042f49 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:22:44 +0100 Subject: [PATCH 36/81] some minor changes --- .../artifact/artifact_log_exporter.py | 15 +++++--------- .../log_stores/artifact/artifact_log_store.py | 20 +++++++------------ src/zenml/log_stores/otel/otel_flavor.py | 5 ----- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 639e0d2089a..765a70ce6b2 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -22,11 +22,6 @@ from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter, LogExportResult -if TYPE_CHECKING: - from opentelemetry.sdk._logs import LogData - - from zenml.utils.logging_utils import LoggingContext - from zenml.artifacts.utils import _load_artifact_store from zenml.client import Client from zenml.enums import LoggingLevels @@ -39,15 +34,15 @@ from zenml.utils.logging_utils import LogEntry from zenml.utils.time_utils import utc_now +if TYPE_CHECKING: + from opentelemetry.sdk._logs import LogData + + from zenml.utils.logging_utils import LoggingContext logger = get_logger(__name__) class ArtifactLogExporter(LogExporter): - """OpenTelemetry exporter that writes logs to ZenML artifact store. - - Groups logs by context and writes them to the appropriate artifact store - location based on the filesystem type. 
- """ + """OpenTelemetry exporter that writes logs to ZenML artifact store.""" def __init__(self) -> None: """Initialize the exporter with file counters per context.""" diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 4abd7ae68ff..fcf701e29d6 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -257,8 +257,7 @@ def fetch( logs_model: "LogsResponse", start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, - limit: int = 20000, - message_size: int = 5120, + limit: int = MAX_ENTRIES_PER_REQUEST ) -> List["LogEntry"]: """Fetch logs from the artifact store. @@ -286,6 +285,12 @@ def fetch( "for ArtifactLogStore.fetch()" ) + if start_time or end_time: + logger.warning( + "start_time and end_time are not supported for " + "ArtifactLogStore.fetch(). Both parameters will be ignored." + ) + client = Client() log_entries = fetch_log_records( zen_store=client.zen_store, @@ -293,15 +298,4 @@ def fetch( logs_uri=logs_model.uri, ) - if start_time or end_time: - filtered_entries = [] - for entry in log_entries: - if entry.timestamp: - if start_time and entry.timestamp < start_time: - continue - if end_time and entry.timestamp > end_time: - continue - filtered_entries.append(entry) - log_entries = filtered_entries - return log_entries[:limit] diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index 62912a42a4d..86da15e27e4 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -27,14 +27,9 @@ class OtelLogStoreConfig(BaseLogStoreConfig): Attributes: service_name: Name of the service (defaults to "zenml"). - service_version: Version of the service. - deployment_environment: Deployment environment (e.g., "production"). max_queue_size: Maximum queue size for batch processor. schedule_delay_millis: Delay between batch exports in milliseconds. max_export_batch_size: Maximum batch size for exports. - endpoint: Optional OTLP endpoint URL (for HTTP/gRPC exporters). - headers: Optional headers for OTLP exporter. - insecure: Whether to use insecure connection for OTLP. """ service_name: str = Field( From a783b62a2c4f49fbd89104fd2c7dff6762036bbf Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:28:04 +0100 Subject: [PATCH 37/81] more minor changes --- src/zenml/log_stores/artifact/artifact_log_store.py | 2 +- src/zenml/log_stores/otel/otel_flavor.py | 5 +++++ src/zenml/log_stores/otel/otel_log_store.py | 11 +++-------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index fcf701e29d6..369ee300207 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -257,7 +257,7 @@ def fetch( logs_model: "LogsResponse", start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, - limit: int = MAX_ENTRIES_PER_REQUEST + limit: int = MAX_ENTRIES_PER_REQUEST, ) -> List["LogEntry"]: """Fetch logs from the artifact store. 
diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index 86da15e27e4..18e93db48e9 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -17,6 +17,7 @@ from pydantic import Field +from zenml import __version__ from zenml.enums import StackComponentType from zenml.log_stores import BaseLogStore, BaseLogStoreConfig from zenml.stack.flavor import Flavor @@ -36,6 +37,10 @@ class OtelLogStoreConfig(BaseLogStoreConfig): default="zenml", description="Name of the service for telemetry", ) + service_version: str = Field( + default=__version__, + description="Version of the service for telemetry", + ) max_queue_size: int = Field( default=8096, description="Maximum queue size for batch log processor", diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index fdb8153c03a..768295456b8 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -25,7 +25,6 @@ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource -from zenml import __version__ from zenml.log_stores.base_log_store import BaseLogStore from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.logger import get_logger @@ -38,19 +37,15 @@ logger = get_logger(__name__) -# Context key for passing LoggingContext through OTel's context system + LOGGING_CONTEXT_KEY = otel_context.create_key("zenml.logging_context") class OtelLogStore(BaseLogStore): """Log store that exports logs using OpenTelemetry. - Each instance creates its own BatchLogRecordProcessor and background thread. - This is simpler than shared infrastructure but means more threads when - multiple log stores are active simultaneously. - Subclasses should implement `get_exporter()` to provide the specific - log exporter for their backend (e.g., ArtifactLogExporter, DatadogLogExporter). + log exporter for their backend. 
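A backend-specific store therefore only needs to supply its exporter. A toy example, using OpenTelemetry's console exporter as a stand-in and omitting the flavor/config registration a real component would need:

    from opentelemetry.sdk._logs.export import ConsoleLogExporter, LogExporter

    from zenml.log_stores.otel.otel_log_store import OtelLogStore


    class ConsoleLogStore(OtelLogStore):
        """Toy log store that prints every OTel log record to the console."""

        def get_exporter(self) -> LogExporter:
            return ConsoleLogExporter()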
""" def __init__(self, *args: Any, **kwargs: Any) -> None: @@ -96,7 +91,7 @@ def activate(self) -> None: self._resource = Resource.create( { "service.name": self.config.service_name, - "service.version": __version__, + "service.version": self.config.service_version, } ) From 0e71b7718140d20c2dcbe0bc48e9ec5cd7b81bc5 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:31:07 +0100 Subject: [PATCH 38/81] new default constants --- src/zenml/log_stores/artifact/artifact_log_exporter.py | 4 +++- src/zenml/log_stores/base_log_store.py | 4 ---- src/zenml/log_stores/datadog/datadog_log_store.py | 4 ++-- src/zenml/log_stores/otel/otel_log_store.py | 8 +++++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 765a70ce6b2..d7f86343be3 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -28,7 +28,6 @@ from zenml.log_stores.artifact.artifact_log_store import ( remove_ansi_escape_codes, ) -from zenml.log_stores.base_log_store import DEFAULT_MESSAGE_SIZE from zenml.log_stores.otel.otel_log_store import LOGGING_CONTEXT_KEY from zenml.logger import get_logger from zenml.utils.logging_utils import LogEntry @@ -38,6 +37,9 @@ from opentelemetry.sdk._logs import LogData from zenml.utils.logging_utils import LoggingContext + +DEFAULT_MESSAGE_SIZE = 5 * 1024 + logger = get_logger(__name__) diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 025c53ef155..9a4f8d812cb 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -23,10 +23,7 @@ from zenml.stack import Flavor, StackComponent, StackComponentConfig from zenml.utils.logging_utils import LogEntry, LoggingContext -# Maximum number of log entries to return in a single request MAX_ENTRIES_PER_REQUEST = 20000 -# Maximum size of a single log message in bytes (5KB) -DEFAULT_MESSAGE_SIZE = 5 * 1024 class BaseLogStoreConfig(StackComponentConfig): @@ -70,7 +67,6 @@ def fetch( start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, limit: int = MAX_ENTRIES_PER_REQUEST, - message_size: int = DEFAULT_MESSAGE_SIZE, ) -> List[LogEntry]: """Fetch logs from the log store. diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 9b8f8c7da9b..f41a0362d83 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -20,6 +20,7 @@ from opentelemetry.sdk._logs.export import LogExporter from zenml.enums import LoggingLevels +from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger @@ -65,8 +66,7 @@ def fetch( logs_model: "LogsResponse", start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, - limit: int = 20000, - message_size: int = 5120, + limit: int = MAX_ENTRIES_PER_REQUEST, ) -> List["LogEntry"]: """Fetch logs from Datadog's API. 
diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 768295456b8..0f6ad44d2ec 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -25,7 +25,10 @@ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource -from zenml.log_stores.base_log_store import BaseLogStore +from zenml.log_stores.base_log_store import ( + MAX_ENTRIES_PER_REQUEST, + BaseLogStore, +) from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.logger import get_logger from zenml.models import LogsResponse @@ -191,8 +194,7 @@ def fetch( logs_model: "LogsResponse", start_time: Optional[datetime] = None, end_time: Optional[datetime] = None, - limit: int = 20000, - message_size: int = 5120, + limit: int = MAX_ENTRIES_PER_REQUEST, ) -> List["LogEntry"]: """Fetch logs from the OpenTelemetry backend. From 2996978949f4bd69d777d19a0f5c99ab672650c7 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:39:42 +0100 Subject: [PATCH 39/81] some more minor fixes --- src/zenml/utils/logging_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 8765139ef1a..c236e138d9f 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -331,8 +331,7 @@ def fetch_logs( except ImportError: raise NotImplementedError( f"Log store '{log_store_model.name}' could not be " - f"instantiated. This is likely because the log store's " - f"dependencies are not installed." + "instantiated." ) else: from zenml.log_stores.artifact.artifact_log_store import ( From 36c9752cda6aed2a6d062f41b729fb616f4cfa9d Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 20 Nov 2025 18:54:50 +0100 Subject: [PATCH 40/81] fix --- .../orchestrators/kubernetes_orchestrator_entrypoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index ef793db024a..2d0ba54eb02 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -258,7 +258,7 @@ def main() -> None: orchestrator_run_id = orchestrator_pod_name if args.run_id: - pipeline_run = client.zen_store.get_pipeline_run(args.run_id) + pipeline_run = client.get_pipeline_run(args.run_id) else: pipeline_run = create_placeholder_run( snapshot=snapshot, From 440e2e46c3eb6ecbf8fccb771261b47e349f4bf4 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 21 Nov 2025 14:48:56 +0100 Subject: [PATCH 41/81] some fixes --- src/zenml/logger.py | 43 ++++++++++++++++++++++++-------- src/zenml/utils/logging_utils.py | 4 ++- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/zenml/logger.py b/src/zenml/logger.py index 0a732ee3c2f..bf7e66cdbf6 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -39,12 +39,37 @@ "step_names_in_console", default=False ) -_original_stdout: Optional[Any] = None -_original_stderr: Optional[Any] = None +_original_stdout_write: Optional[Any] = None +_original_stderr_write: Optional[Any] = None _stdout_wrapped: bool = False _stderr_wrapped: bool = False +class _ZenMLStdoutStream: + """Stream that writes to the original stdout, bypassing the ZenML 
wrapper. + + This ensures console logging doesn't trigger the LoggingContext wrapper, + preventing duplicate log entries in stored logs. + """ + + def write(self, text: str) -> int: + """Write text to the original stdout. + + Args: + text: The text to write. + + Returns: + The number of characters written. + """ + if _original_stdout_write: + return _original_stdout_write(text) + return sys.stdout.write(text) + + def flush(self) -> None: + """Flush the stdout buffer.""" + sys.stdout.flush() + + def get_logger(logger_name: str) -> logging.Logger: """Main function to get logger name,. @@ -249,18 +274,16 @@ def wrapped_write(text: str) -> int: def wrap_stdout_stderr() -> None: """Wrap stdout and stderr write methods to route through LoggingContext.""" global _stdout_wrapped, _stderr_wrapped - global _original_stdout, _original_stderr + global _original_stdout_write, _original_stderr_write if not _stdout_wrapped: - _original_stdout = sys.stdout - original_write = sys.stdout.write - sys.stdout.write = _wrapped_write(original_write, "stdout") + _original_stdout_write = getattr(sys.stdout, "write") + setattr(sys.stdout, "write", _wrapped_write(_original_stdout_write, "stdout")) _stdout_wrapped = True if not _stderr_wrapped: - _original_stderr = sys.stderr - original_write = sys.stderr.write - sys.stderr.write = _wrapped_write(original_write, "stderr") + _original_stderr_write = getattr(sys.stderr, "write") + setattr(sys.stderr, "write", _wrapped_write(_original_stderr_write, "stderr")) _stderr_wrapped = True @@ -284,7 +307,7 @@ def get_console_handler() -> logging.Handler: Returns: A console handler. """ - handler = logging.StreamHandler(_original_stdout) + handler = logging.StreamHandler(_ZenMLStdoutStream()) handler.setFormatter(ConsoleFormatter()) return handler diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index c236e138d9f..07228b2c132 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -128,7 +128,9 @@ def emit(cls, record: logging.LogRecord) -> None: """ try: if context := active_logging_context.get(): - Client().active_stack.log_store.emit(record, context) + message = record.getMessage() + if message and message.strip(): + Client().active_stack.log_store.emit(record, context) except Exception: pass From b1f11996b53a2902543d98831c486141830e3826 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 21 Nov 2025 15:57:22 +0100 Subject: [PATCH 42/81] new try --- src/zenml/log_stores/otel/otel_log_store.py | 16 ++++++++++++++- src/zenml/logger.py | 22 ++++++++++++++------- src/zenml/orchestrators/step_launcher.py | 15 +++++++------- src/zenml/utils/logging_utils.py | 14 ++++++++++--- 4 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 0f6ad44d2ec..473ffc0c43c 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -126,12 +126,26 @@ def emit( schema_url=None, ) + # Get the message and append formatted exception if present + message = record.getMessage() + if record.exc_info: + import traceback + + exc_text = "".join( + traceback.format_exception(*record.exc_info) + ) + # Append to message with separator if message exists + if message: + message = f"{message}\n{exc_text}" + else: + message = exc_text + otel_logger.emit( timestamp=int(record.created * 1e9), observed_timestamp=int(record.created * 1e9), severity_number=self._get_severity_number(record.levelno), 
severity_text=record.levelname, - body=record.getMessage(), + body=message, attributes={ "code.filepath": record.pathname, "code.lineno": record.lineno, diff --git a/src/zenml/logger.py b/src/zenml/logger.py index bf7e66cdbf6..ff950737b84 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -47,24 +47,24 @@ class _ZenMLStdoutStream: """Stream that writes to the original stdout, bypassing the ZenML wrapper. - + This ensures console logging doesn't trigger the LoggingContext wrapper, preventing duplicate log entries in stored logs. """ - + def write(self, text: str) -> int: """Write text to the original stdout. - + Args: text: The text to write. - + Returns: The number of characters written. """ if _original_stdout_write: return _original_stdout_write(text) return sys.stdout.write(text) - + def flush(self) -> None: """Flush the stdout buffer.""" sys.stdout.flush() @@ -278,12 +278,20 @@ def wrap_stdout_stderr() -> None: if not _stdout_wrapped: _original_stdout_write = getattr(sys.stdout, "write") - setattr(sys.stdout, "write", _wrapped_write(_original_stdout_write, "stdout")) + setattr( + sys.stdout, + "write", + _wrapped_write(_original_stdout_write, "stdout"), + ) _stdout_wrapped = True if not _stderr_wrapped: _original_stderr_write = getattr(sys.stderr, "write") - setattr(sys.stderr, "write", _wrapped_write(_original_stderr_write, "stderr")) + setattr( + sys.stderr, + "write", + _wrapped_write(_original_stderr_write, "stderr"), + ) _stderr_wrapped = True diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index f8c3399d707..3c57af5cfd1 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -327,6 +327,7 @@ def launch(self) -> StepRunResponse: if step_run.logs: logs_context = LoggingContext(log_model=step_run.logs) + start_time = time.time() with logs_context: try: # TODO: We still need to apply the fix for step operators here @@ -345,6 +346,12 @@ def launch(self) -> StepRunResponse: ) publish_utils.publish_failed_step_run(step_run.id) raise + + duration = time.time() - start_time + logger.info( + f"Step `{self._invocation_id}` has finished in " + f"`{string_utils.get_human_readable_time(duration)}`." + ) else: logger.info( f"Using cached version of step `{self._invocation_id}`." @@ -431,8 +438,6 @@ def _run_step( skip_artifact_materialization=runtime.should_skip_artifact_materialization(), ) - start_time = time.time() - # To have a cross-platform compatible handling of main thread termination # we use Python's interrupt_main instead of termination signals (not Windows supported). # Since interrupt_main raises KeyboardInterrupt we want in this context to capture it @@ -514,12 +519,6 @@ def _run_step( finally: heartbeat_worker.stop() - duration = time.time() - start_time - logger.info( - f"Step `{self._invocation_id}` has finished in " - f"`{string_utils.get_human_readable_time(duration)}`." - ) - def _run_step_with_step_operator( self, step_operator_name: Optional[str], diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 07228b2c132..780d903df69 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -160,9 +160,17 @@ def __exit__( exc_tb: The traceback of the exception. 
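The intent of keeping the original `write` and introducing `_ZenMLStdoutStream` can be shown with a small stand-alone illustration (an assumed simplification; a StringIO stands in for the real stdout): user output goes through the wrapper and is captured exactly once, while ZenML's own console handler writes through the saved original and is never captured.

    import io

    captured = []             # stands in for LoggingContext.emit(...)
    terminal = io.StringIO()  # stands in for the real sys.stdout

    original_write = terminal.write


    def wrapped_write(text: str) -> int:
        if text.strip():
            captured.append(text.rstrip("\n"))  # routed to the log store
        return original_write(text)             # still shown on the console


    wrapped_write("user print output\n")        # captured once, printed once
    original_write("zenml console log line\n")  # bypass path, never captured

    assert captured == ["user print output"]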
""" if exc_type is not None: - logger.error( - "An exception has occurred.", - exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, + LoggingContext.emit( + logging.LogRecord( + name="", + level=logging.ERROR, + msg="An exception has occurred.", + args=(), + exc_info=(exc_type, exc_val, exc_tb) if exc_val else None, + func=None, + pathname="", + lineno=0, + ) ) with self._lock: From d3388180da54d334a6f65eb00cc6fb039e83ade2 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 24 Nov 2025 14:09:27 +0100 Subject: [PATCH 43/81] Fix infinite loop on debug logs --- .../log_stores/artifact/artifact_log_exporter.py | 4 ++-- .../log_stores/datadog/datadog_log_exporter.py | 4 ++-- src/zenml/utils/logging_utils.py | 13 ++++++++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index d7f86343be3..2c14dab65c5 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -91,8 +91,8 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.SUCCESS - except Exception as e: - logger.error(f"Failed to export logs to artifact store: {e}") + except Exception: + logger.exception("Failed to export logs to artifact store") return LogExportResult.FAILURE def _otel_record_to_log_entries( diff --git a/src/zenml/log_stores/datadog/datadog_log_exporter.py b/src/zenml/log_stores/datadog/datadog_log_exporter.py index d8f7937a6e0..81b440b488c 100644 --- a/src/zenml/log_stores/datadog/datadog_log_exporter.py +++ b/src/zenml/log_stores/datadog/datadog_log_exporter.py @@ -103,8 +103,8 @@ def export(self, batch: List[LogData]) -> Any: f"Datadog rejected logs: {response.status_code} - {response.text[:200]}" ) return LogExportResult.FAILURE - except Exception as e: - logger.error(f"Failed to export logs to Datadog: {e}") + except Exception: + logger.exception("Failed to export logs to Datadog") return LogExportResult.FAILURE def shutdown(self) -> None: diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 780d903df69..db4b120a498 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -115,6 +115,7 @@ def __init__( self.log_model = log_model self._lock = threading.Lock() self._previous_context: Optional[LoggingContext] = None + self._disabled = False @classmethod def emit(cls, record: logging.LogRecord) -> None: @@ -128,9 +129,15 @@ def emit(cls, record: logging.LogRecord) -> None: """ try: if context := active_logging_context.get(): - message = record.getMessage() - if message and message.strip(): - Client().active_stack.log_store.emit(record, context) + if context._disabled: + return + context._disabled = True + try: + message = record.getMessage() + if message and message.strip(): + Client().active_stack.log_store.emit(record, context) + finally: + context._disabled = False except Exception: pass From 14fdc001823e8d7d40026a095aba1f0566dc9653 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 24 Nov 2025 14:20:41 +0100 Subject: [PATCH 44/81] Log exceptions raised during the logger context emit calls --- src/zenml/utils/logging_utils.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index db4b120a498..100a4e7cce4 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -127,19 
+127,18 @@ def emit(cls, record: logging.LogRecord) -> None: Args: record: The log record to emit. """ - try: - if context := active_logging_context.get(): - if context._disabled: - return - context._disabled = True - try: - message = record.getMessage() - if message and message.strip(): - Client().active_stack.log_store.emit(record, context) - finally: - context._disabled = False - except Exception: - pass + if context := active_logging_context.get(): + if context._disabled: + return + context._disabled = True + try: + message = record.getMessage() + if message and message.strip(): + Client().active_stack.log_store.emit(record, context) + except Exception: + logger.debug("Failed to emit log record", exc_info=True) + finally: + context._disabled = False def __enter__(self) -> "LoggingContext": """Enter the context and set as active. From c5c377a6c819490ade25785612e9d42452cbaaa9 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 24 Nov 2025 17:48:06 +0100 Subject: [PATCH 45/81] Decoupled logging context from the log store and added flush method to the log store abstraction. --- .../artifact/artifact_log_exporter.py | 32 +++--- src/zenml/log_stores/base_log_store.py | 39 +++++++- src/zenml/log_stores/otel/otel_log_store.py | 97 +++++++++---------- src/zenml/utils/logging_utils.py | 5 +- 4 files changed, 102 insertions(+), 71 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 2c14dab65c5..340db4223f3 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -30,13 +30,13 @@ ) from zenml.log_stores.otel.otel_log_store import LOGGING_CONTEXT_KEY from zenml.logger import get_logger +from zenml.models import LogsResponse from zenml.utils.logging_utils import LogEntry from zenml.utils.time_utils import utc_now if TYPE_CHECKING: from opentelemetry.sdk._logs import LogData - from zenml.utils.logging_utils import LoggingContext DEFAULT_MESSAGE_SIZE = 5 * 1024 @@ -64,20 +64,20 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: try: logs_by_context: Dict[UUID, List[str]] = defaultdict(list) - context_metadata: Dict[UUID, "LoggingContext"] = {} + log_models: Dict[UUID, "LogsResponse"] = {} for log_data in batch: if not log_data.log_record.context: continue - context = otel_context.get_value( + log_model = otel_context.get_value( LOGGING_CONTEXT_KEY, log_data.log_record.context ) - if not context: + if not log_model: continue - log_id = context.log_model.id - context_metadata[log_id] = context + log_id = log_model.id + log_models[log_id] = log_model entries = self._otel_record_to_log_entries(log_data.log_record) for entry in entries: @@ -86,8 +86,8 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: for log_id, log_lines in logs_by_context.items(): if log_lines: - context = context_metadata[log_id] - self._write_to_artifact_store(log_lines, context, log_id) + log_model = log_models[log_id] + self._write_to_artifact_store(log_lines, log_model) return LogExportResult.SUCCESS @@ -222,20 +222,18 @@ def _split_to_chunks(self, message: str) -> List[str]: def _write_to_artifact_store( self, log_lines: List[str], - context: "LoggingContext", - log_id: UUID, + log_model: "LogsResponse", ) -> None: """Write log lines to the artifact store. Args: log_lines: List of JSON-serialized log entries. - context: The LoggingContext containing log_model metadata. + log_model: The log model. 
log_id: The log ID for tracking file counters. """ - log_model = context.log_model if not log_model.uri or not log_model.artifact_store_id: logger.warning( - f"Skipping log write: missing uri or artifact_store_id for log {log_id}" + f"Skipping log write: missing uri or artifact_store_id for log {log_model.id}" ) return @@ -249,13 +247,13 @@ def _write_to_artifact_store( if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: timestamp = int(time.time() * 1000) - if log_id not in self.file_counters: - self.file_counters[log_id] = 0 - self.file_counters[log_id] += 1 + if log_model.id not in self.file_counters: + self.file_counters[log_model.id] = 0 + self.file_counters[log_model.id] += 1 file_uri = os.path.join( log_model.uri, - f"{timestamp}_{self.file_counters[log_id]}.jsonl", + f"{timestamp}_{self.file_counters[log_model.id]}.jsonl", ) with artifact_store.open(file_uri, "w") as f: diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 9a4f8d812cb..afc18d56d74 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -14,14 +14,15 @@ """Base class for log stores.""" import logging +import threading from abc import abstractmethod from datetime import datetime -from typing import List, Optional, Type, cast +from typing import Any, List, Optional, Type, cast from zenml.enums import StackComponentType from zenml.models import LogsResponse from zenml.stack import Flavor, StackComponent, StackComponentConfig -from zenml.utils.logging_utils import LogEntry, LoggingContext +from zenml.utils.logging_utils import LogEntry MAX_ENTRIES_PER_REQUEST = 20000 @@ -38,6 +39,17 @@ class BaseLogStore(StackComponent): logs in different backends (artifact store, OpenTelemetry, Datadog, etc.). """ + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize the log store. + + Args: + *args: Positional arguments for the base class. + **kwargs: Keyword arguments for the base class. + """ + super().__init__(*args, **kwargs) + self._emitter_counter = 0 + self._lock = threading.RLock() + @property def config(self) -> BaseLogStoreConfig: """Returns the configuration of the log store. @@ -51,13 +63,32 @@ def config(self) -> BaseLogStoreConfig: def emit( self, record: logging.LogRecord, - context: LoggingContext, + log_model: LogsResponse, ) -> None: """Process a log record from the logging system. Args: record: The Python logging.LogRecord to process. - context: The logging context containing the log_model. + log_model: The log model to emit the log record to. + """ + + def register_emitter(self) -> None: + """Register an emitter for the log store.""" + with self._lock: + self._emitter_counter += 1 + + def deregister_emitter(self) -> None: + """Deregister an emitter for the log store.""" + with self._lock: + self._emitter_counter -= 1 + if self._emitter_counter == 0: + self.flush() + + @abstractmethod + def flush(self) -> None: + """Flush the log store. + + This method is called to ensure that all logs are flushed to the backend. 
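The emitter bookkeeping amounts to reference counting: every `LoggingContext` registers itself on enter and deregisters on exit, and the store flushes its batch processor once the last emitter goes away. Roughly (assuming the active stack has a log store configured):

    from zenml.client import Client

    log_store = Client().active_stack.log_store

    log_store.register_emitter()        # done by LoggingContext.__enter__
    try:
        ...                             # log_store.emit(record, log_model) calls happen here
    finally:
        log_store.deregister_emitter()  # done by LoggingContext.__exit__; the counter
                                        # reaching zero triggers log_store.flush()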
""" @abstractmethod diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 473ffc0c43c..2590fc31572 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -14,7 +14,6 @@ """OpenTelemetry log store implementation.""" import logging -import threading from abc import abstractmethod from datetime import datetime from typing import TYPE_CHECKING, Any, List, Optional, cast @@ -36,11 +35,10 @@ if TYPE_CHECKING: from opentelemetry.sdk._logs.export import LogExporter - from zenml.utils.logging_utils import LogEntry, LoggingContext + from zenml.utils.logging_utils import LogEntry logger = get_logger(__name__) - LOGGING_CONTEXT_KEY = otel_context.create_key("zenml.logging_context") @@ -64,7 +62,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._exporter: Optional["LogExporter"] = None self._provider: Optional["LoggerProvider"] = None self._processor: Optional["BatchLogRecordProcessor"] = None - self._activation_lock = threading.Lock() @property def config(self) -> OtelLogStoreConfig: @@ -104,60 +101,62 @@ def activate(self) -> None: def emit( self, record: logging.LogRecord, - context: "LoggingContext", + log_model: "LogsResponse", ) -> None: """Process a log record by sending to OpenTelemetry. Args: record: The log record to process. - context: The logging context containing the log_model. + log_model: The log model to emit the log record to. """ - with self._activation_lock: + with self._lock: if not self._provider: self.activate() - try: - # Attach the LoggingContext to OTel's context so the exporter - # can access it in the background processor thread - ctx = otel_context.set_value(LOGGING_CONTEXT_KEY, context) - - otel_logger = self._provider.get_logger( - record.name or "unknown", - schema_url=None, - ) - - # Get the message and append formatted exception if present - message = record.getMessage() - if record.exc_info: - import traceback - - exc_text = "".join( - traceback.format_exception(*record.exc_info) - ) - # Append to message with separator if message exists - if message: - message = f"{message}\n{exc_text}" - else: - message = exc_text - - otel_logger.emit( - timestamp=int(record.created * 1e9), - observed_timestamp=int(record.created * 1e9), - severity_number=self._get_severity_number(record.levelno), - severity_text=record.levelname, - body=message, - attributes={ - "code.filepath": record.pathname, - "code.lineno": record.lineno, - "code.function": record.funcName, - "log_id": str(context.log_model.id), - "log_store_id": str(self.id), - }, - context=ctx, - ) - - except Exception: - pass + # Attach the LoggingContext to OTel's context so the exporter + # can access it in the background processor thread + ctx = otel_context.set_value(LOGGING_CONTEXT_KEY, log_model) + + otel_logger = self._provider.get_logger( + record.name or "unknown", + schema_url=None, + ) + + # Get the message and append formatted exception if present + message = record.getMessage() + if record.exc_info: + import traceback + + exc_text = "".join(traceback.format_exception(*record.exc_info)) + # Append to message with separator if message exists + if message: + message = f"{message}\n{exc_text}" + else: + message = exc_text + + otel_logger.emit( + timestamp=int(record.created * 1e9), + observed_timestamp=int(record.created * 1e9), + severity_number=self._get_severity_number(record.levelno), + severity_text=record.levelname, + body=message, + attributes={ + "code.filepath": record.pathname, + 
"code.lineno": record.lineno, + "code.function": record.funcName, + "log_id": str(log_model.id), + "log_store_id": str(self.id), + }, + context=ctx, + ) + + def flush(self) -> None: + """Flush the log store. + + This method is called to ensure that all logs are flushed to the backend. + """ + if self._processor: + self._processor.force_flush() def _get_severity_number(self, levelno: int) -> int: """Map Python log level to OTEL severity number. diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 100a4e7cce4..6cff75756f1 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -116,6 +116,7 @@ def __init__( self._lock = threading.Lock() self._previous_context: Optional[LoggingContext] = None self._disabled = False + self._log_store = Client().active_stack.log_store @classmethod def emit(cls, record: logging.LogRecord) -> None: @@ -134,7 +135,7 @@ def emit(cls, record: logging.LogRecord) -> None: try: message = record.getMessage() if message and message.strip(): - Client().active_stack.log_store.emit(record, context) + context._log_store.emit(record, context.log_model) except Exception: logger.debug("Failed to emit log record", exc_info=True) finally: @@ -149,6 +150,7 @@ def __enter__(self) -> "LoggingContext": with self._lock: self._previous_context = active_logging_context.get() active_logging_context.set(self) + self._log_store.register_emitter() return self @@ -181,6 +183,7 @@ def __exit__( with self._lock: active_logging_context.set(self._previous_context) + self._log_store.deregister_emitter() def generate_logs_request(source: str) -> LogsRequest: From 9dac5340b2a62000d91f0122cb0912817524fc40 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 27 Nov 2025 10:26:44 +0100 Subject: [PATCH 46/81] optimizing --- .../artifact/artifact_log_exporter.py | 20 ++++++++----------- .../log_stores/artifact/artifact_log_store.py | 5 ++++- .../artifact/artifact_log_store_flavor.py | 5 +---- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 340db4223f3..2dfc83ce108 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -22,8 +22,7 @@ from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter, LogExportResult -from zenml.artifacts.utils import _load_artifact_store -from zenml.client import Client +from zenml.artifact_stores.base_artifact_store import BaseArtifactStore from zenml.enums import LoggingLevels from zenml.log_stores.artifact.artifact_log_store import ( remove_ansi_escape_codes, @@ -46,9 +45,10 @@ class ArtifactLogExporter(LogExporter): """OpenTelemetry exporter that writes logs to ZenML artifact store.""" - def __init__(self) -> None: + def __init__(self, artifact_store: "BaseArtifactStore") -> None: """Initialize the exporter with file counters per context.""" self.file_counters: Dict[UUID, int] = {} + self.artifact_store = artifact_store def export(self, batch: Sequence["LogData"]) -> LogExportResult: """Export a batch of logs to the artifact store. 
@@ -237,15 +237,10 @@ def _write_to_artifact_store( ) return - client = Client() - artifact_store = _load_artifact_store( - log_model.artifact_store_id, client.zen_store - ) - try: content = "\n".join(log_lines) + "\n" - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: timestamp = int(time.time() * 1000) if log_model.id not in self.file_counters: self.file_counters[log_model.id] = 0 @@ -256,17 +251,18 @@ def _write_to_artifact_store( f"{timestamp}_{self.file_counters[log_model.id]}.jsonl", ) - with artifact_store.open(file_uri, "w") as f: + with self.artifact_store.open(file_uri, "w") as f: f.write(content) else: - with artifact_store.open(log_model.uri, "a") as f: + with self.artifact_store.open(log_model.uri, "a") as f: f.write(content) except Exception as e: logger.error(f"Failed to write logs to {log_model.uri}: {e}") raise finally: - artifact_store.cleanup() + self.artifact_store.cleanup() def shutdown(self) -> None: """Shutdown the exporter.""" + # TODO: Merge pass diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 369ee300207..accc33766da 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -246,11 +246,14 @@ def get_exporter(self) -> "LogExporter": Returns: The ArtifactLogExporter instance. """ + from zenml.client import Client from zenml.log_stores.artifact.artifact_log_exporter import ( ArtifactLogExporter, ) - return ArtifactLogExporter() + return ArtifactLogExporter( + artifact_store=Client().active_stack.artifact_store + ) def fetch( self, diff --git a/src/zenml/log_stores/artifact/artifact_log_store_flavor.py b/src/zenml/log_stores/artifact/artifact_log_store_flavor.py index 062b426378d..7923626c285 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store_flavor.py +++ b/src/zenml/log_stores/artifact/artifact_log_store_flavor.py @@ -25,10 +25,7 @@ class ArtifactLogStoreConfig(OtelLogStoreConfig): - """Configuration for the artifact log store. - - This log store saves logs to the artifact store using OTEL infrastructure. 
-    """
+    """Configuration for the artifact log store."""
 
 
 class ArtifactLogStoreFlavor(OtelLogStoreFlavor):

From 33124736b83a5e2d31977befcf2788e93e12124a Mon Sep 17 00:00:00 2001
From: Baris Can Durak
Date: Sun, 30 Nov 2025 03:52:38 +0100
Subject: [PATCH 47/81] some changes

---
 .../execution/pipeline/dynamic/runner.py      |  18 ++-
 .../kubernetes_orchestrator_entrypoint.py     |  17 ++-
 .../artifact/artifact_log_exporter.py         | 111 +++++++++++++----
 .../log_stores/artifact/artifact_log_store.py |  25 +++-
 src/zenml/log_stores/otel/otel_log_store.py   |  10 +-
 src/zenml/models/__init__.py                  |   1 +
 src/zenml/models/v2/core/step_run.py          |  43 +++++--
 src/zenml/orchestrators/step_launcher.py      |  14 ++-
 src/zenml/orchestrators/step_runner.py        |  25 ++--
 src/zenml/pipelines/pipeline_definition.py    |  40 ++----
 src/zenml/stack/stack.py                      |   2 +
 src/zenml/utils/logging_utils.py              | 116 +++++++++++++---
 .../zen_server/routers/steps_endpoints.py     |  25 ++--
 .../versions/5c0a1c787128_add_log_stores.py   |   2 +-
 .../zen_stores/schemas/step_run_schemas.py    |  14 ++-
 15 files changed, 332 insertions(+), 131 deletions(-)

diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py
index 57e25983c8c..aa2ba71bd92 100644
--- a/src/zenml/execution/pipeline/dynamic/runner.py
+++ b/src/zenml/execution/pipeline/dynamic/runner.py
@@ -18,6 +18,7 @@
 import inspect
 import itertools
 from concurrent.futures import ThreadPoolExecutor
+from contextlib import nullcontext
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -67,7 +68,10 @@
 from zenml.steps.entrypoint_function_utils import StepArtifact
 from zenml.steps.utils import OutputSignature
 from zenml.utils import source_utils
-from zenml.utils.logging_utils import setup_orchestrator_logging
+from zenml.utils.logging_utils import (
+    is_pipeline_logging_enabled,
+    setup_run_logging,
+)
 
 if TYPE_CHECKING:
     from zenml.config import DockerSettings
@@ -157,9 +161,15 @@ def run_pipeline(self) -> None:
             orchestrator_run_id=self._orchestrator_run_id,
         )
 
-        logging_context = setup_orchestrator_logging(
-            pipeline_run=run, snapshot=self._snapshot
-        )
+        logging_context = nullcontext()
+        if is_pipeline_logging_enabled(
+            self._snapshot.pipeline_configuration
+        ):
+            logging_context = setup_run_logging(
+                pipeline_run=run,
+                source="orchestrator",
+            )
+
         with logging_context:
             assert (
                 self._snapshot.pipeline_spec
diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
index 2d0ba54eb02..cfc85755692 100644
--- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
+++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
@@ -18,6 +18,7 @@
 import socket
 import threading
 import time
+from contextlib import nullcontext
 from typing import List, Optional, Tuple, cast
 from uuid import UUID
 
@@ -71,7 +72,10 @@
 )
 from zenml.pipelines.run_utils import create_placeholder_run
 from zenml.utils import env_utils
-from zenml.utils.logging_utils import setup_orchestrator_logging
+from zenml.utils.logging_utils import (
+    is_pipeline_logging_enabled,
+    setup_run_logging,
+)
 
 logger = get_logger(__name__)
 
@@ -281,10 +285,13 @@ def main() -> None:
         for step_name, step in snapshot.step_configurations.items()
     ]
 
-    logs_context = setup_orchestrator_logging(
-        pipeline_run=pipeline_run,
-        snapshot=snapshot,
-    )
+    logs_context = nullcontext()
+    if 
is_pipeline_logging_enabled(snapshot.pipeline_configuration): + logs_context = setup_run_logging( + pipeline_run=pipeline_run, + source="orchestrator", + ) + with logs_context: step_command = StepEntrypointConfiguration.get_entrypoint_command() mount_local_stores = active_stack.orchestrator.config.is_local diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 2dfc83ce108..11d6776f433 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -27,7 +27,10 @@ from zenml.log_stores.artifact.artifact_log_store import ( remove_ansi_escape_codes, ) -from zenml.log_stores.otel.otel_log_store import LOGGING_CONTEXT_KEY +from zenml.log_stores.otel.otel_log_store import ( + ZENML_LOGGING_CONTEXT_EXIT_TOKEN, + ZENML_OTEL_LOG_STORE_CONTEXT_KEY, +) from zenml.logger import get_logger from zenml.models import LogsResponse from zenml.utils.logging_utils import LogEntry @@ -38,6 +41,7 @@ DEFAULT_MESSAGE_SIZE = 5 * 1024 +LOGS_EXTENSION = ".log" logger = get_logger(__name__) @@ -46,8 +50,11 @@ class ArtifactLogExporter(LogExporter): """OpenTelemetry exporter that writes logs to ZenML artifact store.""" def __init__(self, artifact_store: "BaseArtifactStore") -> None: - """Initialize the exporter with file counters per context.""" - self.file_counters: Dict[UUID, int] = {} + """Initialize the exporter. + + Args: + artifact_store: The artifact store to write logs to. + """ self.artifact_store = artifact_store def export(self, batch: Sequence["LogData"]) -> LogExportResult: @@ -63,31 +70,34 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.SUCCESS try: - logs_by_context: Dict[UUID, List[str]] = defaultdict(list) - log_models: Dict[UUID, "LogsResponse"] = {} + entries_by_id: Dict[UUID, List[str]] = defaultdict(list) + responses_by_id: Dict[UUID, "LogsResponse"] = {} for log_data in batch: - if not log_data.log_record.context: - continue - log_model = otel_context.get_value( - LOGGING_CONTEXT_KEY, log_data.log_record.context + key=ZENML_OTEL_LOG_STORE_CONTEXT_KEY, + context=log_data.log_record.context, ) if not log_model: continue - log_id = log_model.id - log_models[log_id] = log_model + responses_by_id[log_model.id] = log_model entries = self._otel_record_to_log_entries(log_data.log_record) for entry in entries: json_line = entry.model_dump_json(exclude_none=True) - logs_by_context[log_id].append(json_line) + entries_by_id[log_model.id].append(json_line) - for log_id, log_lines in logs_by_context.items(): + for log_id, log_lines in entries_by_id.items(): if log_lines: - log_model = log_models[log_id] - self._write_to_artifact_store(log_lines, log_model) + log_model = responses_by_id[log_id] + + last = False + if ZENML_LOGGING_CONTEXT_EXIT_TOKEN in log_lines: + last = True + log_lines.pop(-1) + + self._write(log_lines, log_model, last=last) return LogExportResult.SUCCESS @@ -219,17 +229,18 @@ def _split_to_chunks(self, message: str) -> List[str]: return chunks - def _write_to_artifact_store( + def _write( self, log_lines: List[str], log_model: "LogsResponse", + last: bool = False, ) -> None: """Write log lines to the artifact store. Args: log_lines: List of JSON-serialized log entries. log_model: The log model. - log_id: The log ID for tracking file counters. + last: Whether this is the last batch of log lines. 
""" if not log_model.uri or not log_model.artifact_store_id: logger.warning( @@ -241,27 +252,81 @@ def _write_to_artifact_store( content = "\n".join(log_lines) + "\n" if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - timestamp = int(time.time() * 1000) - if log_model.id not in self.file_counters: - self.file_counters[log_model.id] = 0 - self.file_counters[log_model.id] += 1 - + timestamp = time.time() file_uri = os.path.join( log_model.uri, - f"{timestamp}_{self.file_counters[log_model.id]}.jsonl", + f"{timestamp}{LOGS_EXTENSION}", ) with self.artifact_store.open(file_uri, "w") as f: f.write(content) + + if last: + self._merge(log_model) else: with self.artifact_store.open(log_model.uri, "a") as f: f.write(content) + + if last: + self.artifact_store._remove_previous_file_versions( + log_model.uri + ) + except Exception as e: logger.error(f"Failed to write logs to {log_model.uri}: {e}") raise finally: self.artifact_store.cleanup() + def _merge(self, log_model: "LogsResponse"): + """Merges all log files into one in the given URI. + + Called on the logging context exit. + + Args: + log_model: The log model. + """ + # If the artifact store is immutable, merge the log files + if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + from zenml.artifacts.utils import _load_file_from_artifact_store + from zenml.exceptions import DoesNotExistException + + files_ = self.artifact_store.listdir(log_model.uri) + if len(files_) > 1: + files_.sort() + + missing_files = set() + # dump all logs to a local file first + with self.artifact_store.open( + os.path.join( + log_model.uri, f"{time.time()}_merged{LOGS_EXTENSION}" + ), + "w", + ) as merged_file: + for file in files_: + try: + merged_file.write( + str( + _load_file_from_artifact_store( + os.path.join(self.logs_uri, str(file)), + artifact_store=self.artifact_store, + mode="r", + ) + ) + ) + except DoesNotExistException: + missing_files.add(file) + + # clean up left over files + for file in files_: + if file not in missing_files: + self.artifact_store.remove( + os.path.join(self.logs_uri, str(file)) + ) + + # Update the last merge time + self.last_merge_time = time.time() + def shutdown(self) -> None: """Shutdown the exporter.""" # TODO: Merge diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index accc33766da..f96caf6bad7 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -17,6 +17,7 @@ import re from datetime import datetime from typing import ( + Any, Iterator, List, Optional, @@ -231,6 +232,19 @@ class ArtifactLogStore(OtelLogStore): including shared BatchLogRecordProcessor and routing. """ + def __init__( + self, artifact_store: "BaseArtifactStore", *args: Any, **kwargs: Any + ) -> None: + """Initialize the artifact log store. + + Args: + artifact_store: The artifact store to use for logging. + *args: Positional arguments for the base class. + **kwargs: Keyword arguments for the base class. + """ + super().__init__(*args, **kwargs) + self._artifact_store = artifact_store + @property def config(self) -> ArtifactLogStoreConfig: """Returns the configuration of the artifact log store. @@ -246,14 +260,11 @@ def get_exporter(self) -> "LogExporter": Returns: The ArtifactLogExporter instance. 
""" - from zenml.client import Client from zenml.log_stores.artifact.artifact_log_exporter import ( ArtifactLogExporter, ) - return ArtifactLogExporter( - artifact_store=Client().active_stack.artifact_store - ) + return ArtifactLogExporter(artifact_store=self._artifact_store) def fetch( self, @@ -288,6 +299,12 @@ def fetch( "for ArtifactLogStore.fetch()" ) + if logs_model.artifact_store_id != self._artifact_store.id: + raise ValueError( + "logs_model.artifact_store_id does not match the artifact store " + "id of the log store." + ) + if start_time or end_time: logger.warning( "start_time and end_time are not supported for " diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 2590fc31572..01ad68c621b 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -39,7 +39,9 @@ logger = get_logger(__name__) -LOGGING_CONTEXT_KEY = otel_context.create_key("zenml.logging_context") +ZENML_OTEL_LOG_STORE_CONTEXT_KEY = otel_context.create_key( + "zenml.logging_context" +) class OtelLogStore(BaseLogStore): @@ -113,9 +115,11 @@ def emit( if not self._provider: self.activate() - # Attach the LoggingContext to OTel's context so the exporter + # Attach the log_model to OTel's context so the exporter # can access it in the background processor thread - ctx = otel_context.set_value(LOGGING_CONTEXT_KEY, log_model) + ctx = otel_context.set_value( + ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model + ) otel_logger = self._provider.get_logger( record.name or "unknown", diff --git a/src/zenml/models/__init__.py b/src/zenml/models/__init__.py index e5468dd779e..73f2223cdcc 100644 --- a/src/zenml/models/__init__.py +++ b/src/zenml/models/__init__.py @@ -554,6 +554,7 @@ StackResponseMetadata.model_rebuild() StackResponseResources.model_rebuild() StepRunRequest.model_rebuild() +StepRunUpdate.model_rebuild() StepRunResponseBody.model_rebuild() StepRunResponseMetadata.model_rebuild() StepRunResponseResources.model_rebuild() diff --git a/src/zenml/models/v2/core/step_run.py b/src/zenml/models/v2/core/step_run.py index f5c798101ea..39c28f5d4cf 100644 --- a/src/zenml/models/v2/core/step_run.py +++ b/src/zenml/models/v2/core/step_run.py @@ -204,6 +204,9 @@ class StepRunUpdate(BaseUpdate): "results anymore.", default=None, ) + add_logs: Optional[List["LogsRequest"]] = Field( + default=None, title="New logs to add to the step run." + ) model_config = ConfigDict(protected_namespaces=()) @@ -287,10 +290,6 @@ class StepRunResponseMetadata(ProjectScopedResponseMetadata): ) # References - logs: Optional["LogsResponse"] = Field( - title="Logs associated with this step run.", - default=None, - ) snapshot_id: UUID = Field( title="The snapshot associated with the step run." 
) @@ -314,6 +313,15 @@ class StepRunResponseMetadata(ProjectScopedResponseMetadata): class StepRunResponseResources(ProjectScopedResponseResources): """Class for all resource models associated with the step run entity.""" + logs: Optional["LogsResponse"] = Field( + title="Logs associated with this step run.", + default=None, + ) + log_collection: Optional[List["LogsResponse"]] = Field( + title="Logs associated with this step run.", + default=None, + ) + model_version: Optional[ModelVersionResponse] = None inputs: Dict[str, List[StepRunInputResponse]] = Field( title="The input artifact versions of the step run.", @@ -609,15 +617,6 @@ def latest_heartbeat(self) -> Optional[datetime]: """ return self.get_body().latest_heartbeat - @property - def logs(self) -> Optional["LogsResponse"]: - """The `logs` property. - - Returns: - the value of the property. - """ - return self.get_metadata().logs - @property def snapshot_id(self) -> UUID: """The `snapshot_id` property. @@ -663,6 +662,24 @@ def run_metadata(self) -> Dict[str, MetadataType]: """ return self.get_metadata().run_metadata + @property + def logs(self) -> Optional["LogsResponse"]: + """The `logs` property. + + Returns: + the value of the property. + """ + return self.get_resources().logs + + @property + def log_collection(self) -> Optional[List["LogsResponse"]]: + """The `log_collection` property. + + Returns: + the value of the property. + """ + return self.get_resources().log_collection + @property def model_version(self) -> Optional[ModelVersionResponse]: """The `model_version` property. diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 60f46bd9c92..a5a2f86bfbe 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -43,8 +43,9 @@ from zenml.stack import Stack from zenml.utils import env_utils, exception_utils, string_utils from zenml.utils.logging_utils import ( - LoggingContext, generate_logs_request, + is_step_logging_enabled, + setup_step_logging, ) from zenml.utils.time_utils import utc_now @@ -320,13 +321,18 @@ def launch(self) -> StepRunResponse: logger.info(f"Step `{self._invocation_id}` has started.") logs_context = nullcontext() - if step_run.logs: - logs_context = LoggingContext(log_model=step_run.logs) + if is_step_logging_enabled( + step_configuration=step_run.config, + pipeline_configuration=pipeline_run.config, + ): + logs_context = setup_step_logging( + step_run=step_run, + source="step", + ) start_time = time.time() with logs_context: try: - # TODO: We still need to apply the fix for step operators here self._run_step( pipeline_run=pipeline_run, step_run=step_run, diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index 56081d52eb9..e1cc1b3949c 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -33,9 +33,7 @@ from zenml.config.step_configurations import StepConfiguration from zenml.config.step_run_info import StepRunInfo from zenml.constants import ( - ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, ENV_ZENML_STEP_OPERATOR, - handle_bool_env_var, ) from zenml.enums import ArtifactSaveType from zenml.exceptions import StepInterfaceError @@ -76,7 +74,10 @@ string_utils, tag_utils, ) -from zenml.utils.logging_utils import LoggingContext +from zenml.utils.logging_utils import ( + is_step_logging_enabled, + setup_step_logging, +) from zenml.utils.typing_utils import get_args, get_origin, is_union if TYPE_CHECKING: @@ -143,20 +144,12 @@ def run( 
""" from zenml.deployers.server import runtime - if handle_bool_env_var(ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, False): - step_logging_enabled = False - else: - enabled_on_step = step_run.config.enable_step_logs - enabled_on_pipeline = pipeline_run.config.enable_step_logs - - step_logging_enabled = is_setting_enabled( - is_enabled_on_step=enabled_on_step, - is_enabled_on_pipeline=enabled_on_pipeline, - ) - logs_context = nullcontext() - if step_logging_enabled and step_run.logs: - logs_context = LoggingContext(log_model=step_run.logs) + if is_step_logging_enabled(step_run.config, pipeline_run.config): + logs_context = setup_step_logging( + step_run=step_run, + source="step", + ) with logs_context: step_instance = self._load_step() diff --git a/src/zenml/pipelines/pipeline_definition.py b/src/zenml/pipelines/pipeline_definition.py index e120508185b..61250ca5211 100644 --- a/src/zenml/pipelines/pipeline_definition.py +++ b/src/zenml/pipelines/pipeline_definition.py @@ -52,10 +52,6 @@ from zenml.config.pipeline_spec import PipelineSpec from zenml.config.schedule import Schedule from zenml.config.step_configurations import StepConfigurationUpdate -from zenml.constants import ( - ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, - handle_bool_env_var, -) from zenml.enums import StackComponentType from zenml.exceptions import EntityExistsError from zenml.execution.pipeline.utils import ( @@ -100,7 +96,10 @@ source_utils, yaml_utils, ) -from zenml.utils.logging_utils import LoggingContext, generate_logs_request +from zenml.utils.logging_utils import ( + is_pipeline_logging_enabled, + setup_run_logging, +) from zenml.utils.string_utils import format_name_template from zenml.utils.tag_utils import Tag @@ -1026,39 +1025,22 @@ def _run( with track_handler(AnalyticsEvent.RUN_PIPELINE) as analytics_handler: stack = Client().active_stack - # Enable or disable pipeline run logs storage - if self._run_args.get("schedule"): - # Pipeline runs scheduled to run in the future are not logged - # via the client. 
- logging_enabled = False - elif handle_bool_env_var( - ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, False - ): - logging_enabled = False - else: - logging_enabled = self._run_args.get( - "enable_pipeline_logs", - self.configuration.enable_pipeline_logs - if self.configuration.enable_pipeline_logs is not None - else True, - ) - snapshot = self._create_snapshot(**self._run_args) self.log_pipeline_snapshot_metadata(snapshot) - logs_request = None - if logging_enabled: - logs_request = generate_logs_request(source="client") - run = ( - create_placeholder_run(snapshot=snapshot, logs=logs_request) + create_placeholder_run(snapshot=snapshot) if not snapshot.schedule else None ) logs_context = nullcontext() - if logging_enabled and run and run.logs: - logs_context = LoggingContext(log_model=run.logs) + if run and is_pipeline_logging_enabled( + snapshot.pipeline_configuration + ): + logs_context = setup_run_logging( + pipeline_run=run, source="client" + ) with logs_context: analytics_handler.metadata = ( diff --git a/src/zenml/stack/stack.py b/src/zenml/stack/stack.py index 72d181e1d21..a7912bc6773 100644 --- a/src/zenml/stack/stack.py +++ b/src/zenml/stack/stack.py @@ -563,6 +563,8 @@ def log_store(self) -> "BaseLogStore": created=now, updated=now, secrets=[], + # Here, we tie the artifact log store to the artifact store + artifact_store=self.artifact_store, ) return self._log_store diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 6cff75756f1..4a38c16e714 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -26,8 +26,10 @@ from zenml.client import Client from zenml.config.pipeline_configurations import PipelineConfiguration +from zenml.config.step_configurations import StepConfiguration from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, + ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, handle_bool_env_var, ) from zenml.enums import LoggingLevels, StackComponentType @@ -38,8 +40,10 @@ LogsResponse, PipelineRunResponse, PipelineRunUpdate, - PipelineSnapshotResponse, + StepRunResponse, + StepRunUpdate, ) +from zenml.orchestrators.utils import is_setting_enabled from zenml.stack import StackComponent from zenml.utils.time_utils import utc_now @@ -53,6 +57,8 @@ "active_logging_context", default=None ) +ZENML_LOGGING_CONTEXT_EXIT_TOKEN = "__ZENML_LOGGING_CONTEXT_EXIT__" + class LogEntry(BaseModel): """A structured log entry with parsed information. @@ -181,6 +187,18 @@ def __exit__( ) ) + LoggingContext.emit( + logging.LogRecord( + name="", + level=logging.INFO, + msg=ZENML_LOGGING_CONTEXT_EXIT_TOKEN, + args=(), + exc_info=None, + pathname="", + lineno=0, + ) + ) + with self._lock: active_logging_context.set(self._previous_context) self._log_store.deregister_emitter() @@ -223,7 +241,9 @@ def generate_logs_request(source: str) -> LogsRequest: ) -def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: +def is_pipeline_logging_enabled( + pipeline_configuration: PipelineConfiguration, +) -> bool: """Check if logging is enabled for a pipeline configuration. Args: @@ -240,6 +260,27 @@ def is_logging_enabled(pipeline_configuration: PipelineConfiguration) -> bool: return True +def is_step_logging_enabled( + step_configuration: StepConfiguration, + pipeline_configuration: PipelineConfiguration, +) -> bool: + """Check if logging is enabled for a step configuration. + + Args: + step_configuration: The step configuration. + pipeline_configuration: The pipeline configuration. 
+ + Returns: + True if logging is enabled, False if disabled. + """ + if handle_bool_env_var(ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, False): + return False + else: + is_enabled_on_step = step_configuration.enable_step_logs + is_enabled_on_pipeline = pipeline_configuration.enable_step_logs + return is_setting_enabled(is_enabled_on_step, is_enabled_on_pipeline) + + def search_logs_by_source( logs_collection: List[LogsResponse], source: str ) -> Optional[LogsResponse]: @@ -258,33 +299,28 @@ def search_logs_by_source( return None -def setup_orchestrator_logging( +def setup_run_logging( pipeline_run: "PipelineRunResponse", - snapshot: "PipelineSnapshotResponse", + source: str, ) -> Any: - """Set up logging for an orchestrator environment. + """Set up logging for a pipeline run. - This function can be reused by different orchestrators to set up - consistent logging behavior. + Searches for existing logs by source, updates the run if needed. Args: pipeline_run: The pipeline run. snapshot: The snapshot of the pipeline run. + source: The source of the logs. Returns: - The logs context or nullcontext if logging is disabled. + The logs context. """ - logging_enabled = is_logging_enabled(snapshot.pipeline_configuration) - - if not logging_enabled: - return nullcontext() - if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, "orchestrator" + pipeline_run.log_collection, source ): return LoggingContext(log_model=orchestrator_logs) - logs_request = generate_logs_request(source="orchestrator") + logs_request = generate_logs_request(source=source) try: client = Client() run_update = PipelineRunUpdate(add_logs=[logs_request]) @@ -295,13 +331,47 @@ def setup_orchestrator_logging( logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, "orchestrator" + pipeline_run.log_collection, source ): return LoggingContext(log_model=orchestrator_logs) return nullcontext() +def setup_step_logging( + step_run: "StepRunResponse", + source: str, +) -> Any: + """Set up logging for a step run. + + Searches for existing logs by source, updates the step if needed. + + Args: + step_run: The step run. + source: The source of the logs. + + Returns: + The logs context. + """ + if step_logs := search_logs_by_source(step_run.log_collection, source): + return LoggingContext(log_model=step_logs) + + logs_request = generate_logs_request(source=source) + try: + client = Client() + step_run_update = StepRunUpdate(add_logs=[logs_request]) + step_run = client.zen_store.update_run_step( + step_run_id=step_run.id, step_run_update=step_run_update + ) + except Exception as e: + logger.error(f"Failed to add logs to the step run {step_run.id}: {e}") + + if step_logs := search_logs_by_source(step_run.log_collection, source): + return LoggingContext(log_model=step_logs) + + return nullcontext() + + def fetch_logs( logs: "LogsResponse", zen_store: "BaseZenStore", @@ -353,6 +423,7 @@ def fetch_logs( "instantiated." ) else: + from zenml.artifact_stores.base_artifact_store import BaseArtifactStore from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, ) @@ -361,6 +432,17 @@ def fetch_logs( ) current_time = utc_now() + + artifact_store = zen_store.get_stack_component(logs.artifact_store_id) + if not artifact_store.type == StackComponentType.ARTIFACT_STORE: + raise DoesNotExistException( + f"Stack component '{logs.artifact_store_id}' is not an artifact store." 
+ ) + + artifact_store = cast( + "BaseArtifactStore", + StackComponent.from_model(artifact_store), + ) log_store = ArtifactLogStore( name="default_artifact_log_store", id=uuid4(), @@ -370,6 +452,8 @@ def fetch_logs( user=uuid4(), created=current_time, updated=current_time, + # Here, we tie the artifact log store to the artifact store + artifact_store=artifact_store, ) return log_store.fetch(logs_model=logs, limit=limit) diff --git a/src/zenml/zen_server/routers/steps_endpoints.py b/src/zenml/zen_server/routers/steps_endpoints.py index 7dfc65098c9..7aa2ca82b44 100644 --- a/src/zenml/zen_server/routers/steps_endpoints.py +++ b/src/zenml/zen_server/routers/steps_endpoints.py @@ -324,12 +324,14 @@ def get_step_status( @async_fastapi_endpoint_wrapper def get_step_logs( step_id: UUID, + source: str = "step", _: AuthContext = Security(authorize), ) -> List[LogEntry]: """Get log entries for a step. Args: step_id: ID of the step for which to get the logs. + source: The source of the logs to get. Default is "step". Returns: List of log entries. @@ -337,16 +339,19 @@ def get_step_logs( Raises: KeyError: If no logs are available for this step. """ - step = zen_store().get_run_step(step_id, hydrate=True) - pipeline_run = zen_store().get_run(step.pipeline_run_id) + store = zen_store() + + step = store.get_run_step(step_id, hydrate=True) + pipeline_run = store.get_run(step.pipeline_run_id) verify_permission_for_model(pipeline_run, action=Action.READ) - # Verify that logs are available for this step - if step.logs is None: - raise KeyError("No logs available for this step.") + if step.log_collection: + for logs_response in step.log_collection: + if logs_response.source == source: + return fetch_logs( + logs=logs_response, + zen_store=store, + limit=MAX_ENTRIES_PER_REQUEST, + ) - return fetch_logs( - logs=step.logs, - zen_store=zen_store(), - limit=MAX_ENTRIES_PER_REQUEST, - ) + raise KeyError(f"No logs found for source '{source}' in step {step_id}") diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index 3e2585583ed..e8bdc430f97 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. 
revision = "5c0a1c787128" -down_revision = "0.91.2" +down_revision = "d203788f82b9" branch_labels = None depends_on = None diff --git a/src/zenml/zen_stores/schemas/step_run_schemas.py b/src/zenml/zen_stores/schemas/step_run_schemas.py index da0241d9821..918c8cc3336 100644 --- a/src/zenml/zen_stores/schemas/step_run_schemas.py +++ b/src/zenml/zen_stores/schemas/step_run_schemas.py @@ -188,9 +188,9 @@ class StepRunSchema(NamedSchema, RunMetadataInterface, table=True): output_artifacts: List["StepRunOutputArtifactSchema"] = Relationship( sa_relationship_kwargs={"cascade": "delete"} ) - logs: Optional["LogsSchema"] = Relationship( + logs: List["LogsSchema"] = Relationship( back_populates="step_run", - sa_relationship_kwargs={"cascade": "delete", "uselist": False}, + sa_relationship_kwargs={"cascade": "delete"}, ) parents: List["StepRunParentsSchema"] = Relationship( sa_relationship_kwargs={ @@ -447,7 +447,6 @@ def to_model( ) if self.exception_info else None, - logs=self.logs.to_model() if self.logs else None, snapshot_id=self.snapshot_id, pipeline_run_id=self.pipeline_run_id, original_step_run_id=self.original_step_run_id, @@ -485,9 +484,18 @@ def to_model( output_artifact.artifact_version.to_model() ) + # Add the step logs as "logs" if they exist, for backwards compatibility + # TODO: This will be safe to remove in future releases (>0.84.0). + step_logs = [ + log_entry + for log_entry in self.logs + if log_entry.source == "step" + ] resources = StepRunResponseResources( user=self.user.to_model() if self.user else None, model_version=model_version, + logs=step_logs[0].to_model() if step_logs else None, + log_collection=[log.to_model() for log in self.logs], inputs=input_artifacts, outputs=output_artifacts, ) From 56eefb8a522c075701779d4b44c305168984d41c Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 1 Dec 2025 14:12:25 +0100 Subject: [PATCH 48/81] Intermediate fixes for scalability --- src/zenml/artifacts/utils.py | 4 +- src/zenml/deployers/server/service.py | 49 +++++++++++-------- .../artifact/artifact_log_exporter.py | 9 +++- .../log_stores/artifact/artifact_log_store.py | 47 +++++++++++------- src/zenml/orchestrators/step_launcher.py | 19 +------ src/zenml/stack/stack.py | 47 ++++++------------ src/zenml/stack/stack_component.py | 11 ++++- src/zenml/utils/logging_utils.py | 35 +++---------- src/zenml/zen_server/download_utils.py | 4 +- .../schemas/pipeline_run_schemas.py | 2 +- .../functional/zen_stores/test_zen_store.py | 4 +- 11 files changed, 104 insertions(+), 127 deletions(-) diff --git a/src/zenml/artifacts/utils.py b/src/zenml/artifacts/utils.py index a8e906d5bf6..c1a9c9f83be 100644 --- a/src/zenml/artifacts/utils.py +++ b/src/zenml/artifacts/utils.py @@ -539,7 +539,7 @@ def load_artifact_visualization( f"Artifact '{artifact.id}' cannot be visualized because the " "underlying artifact store was deleted." 
) - artifact_store = _load_artifact_store( + artifact_store = load_artifact_store( artifact_store_id=artifact.artifact_store_id, zen_store=zen_store ) try: @@ -821,7 +821,7 @@ def _load_artifact_from_uri( return artifact -def _load_artifact_store( +def load_artifact_store( artifact_store_id: Union[str, "UUID"], zen_store: Optional["BaseZenStore"] = None, ) -> "BaseArtifactStore": diff --git a/src/zenml/deployers/server/service.py b/src/zenml/deployers/server/service.py index 97a1ef0f47d..a3dac1d5e70 100644 --- a/src/zenml/deployers/server/service.py +++ b/src/zenml/deployers/server/service.py @@ -16,6 +16,7 @@ import time import traceback from abc import ABC, abstractmethod +from contextlib import nullcontext from datetime import datetime, timezone from typing import ( TYPE_CHECKING, @@ -63,6 +64,7 @@ from zenml.stack import Stack from zenml.steps.utils import get_unique_step_output_names from zenml.utils import env_utils, source_utils +from zenml.utils.logging_utils import LoggingContext, generate_logs_request from zenml.zen_stores.rest_zen_store import RestZenStore if TYPE_CHECKING: @@ -313,6 +315,9 @@ def initialize(self) -> None: ) self._client.zen_store.reinitialize_session() + # Instantiate the active stack here to avoid race conditions later + self._client.active_stack.validate() + # Execution tracking self.service_start_time = time.time() self.last_execution_time: Optional[datetime] = None @@ -382,7 +387,7 @@ def execute_pipeline( ) except Exception as e: - logger.error(f"❌ Pipeline execution failed: {e}") + logger.exception("❌ Pipeline execution failed") return self._build_response( placeholder_run=placeholder_run, mapped_outputs=None, @@ -516,16 +521,15 @@ def _prepare_execute_with_orchestrator( deployment_snapshot = self._client.zen_store.create_snapshot( deployment_snapshot_request ) - + logs_request = generate_logs_request(source="deployment") # Create a placeholder run using the new deployment snapshot placeholder_run = run_utils.create_placeholder_run( snapshot=deployment_snapshot, - logs=None, + logs=logs_request, trigger_info=PipelineRunTriggerInfo( deployment_id=self.deployment.id, ), ) - return placeholder_run, deployment_snapshot def _execute_with_orchestrator( @@ -576,23 +580,28 @@ def _execute_with_orchestrator( ) captured_outputs: Optional[Dict[str, Dict[str, Any]]] = None - try: - # Use the new deployment snapshot with pre-configured settings - orchestrator.run( - snapshot=deployment_snapshot, - stack=active_stack, - placeholder_run=placeholder_run, - ) + logging_context = nullcontext() + if placeholder_run.logs: + logging_context = LoggingContext(log_model=placeholder_run.logs) - # Capture in-memory outputs before stopping the runtime context - if runtime.is_active(): - captured_outputs = runtime.get_outputs() - except Exception as e: - logger.exception(f"Failed to execute pipeline: {e}") - raise RuntimeError(f"Failed to execute pipeline: {e}") - finally: - # Always stop deployment runtime context - runtime.stop() + with logging_context: + try: + # Use the new deployment snapshot with pre-configured settings + orchestrator.run( + snapshot=deployment_snapshot, + stack=active_stack, + placeholder_run=placeholder_run, + ) + + # Capture in-memory outputs before stopping the runtime context + if runtime.is_active(): + captured_outputs = runtime.get_outputs() + except Exception as e: + logger.exception(f"Failed to execute pipeline: {e}") + raise RuntimeError(f"Failed to execute pipeline: {e}") + finally: + # Always stop deployment runtime context + runtime.stop() return 
captured_outputs diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 11d6776f433..116d8d8ae3e 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -252,6 +252,9 @@ def _write( content = "\n".join(log_lines) + "\n" if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + if not self.artifact_store.exists(log_model.uri): + self.artifact_store.makedirs(log_model.uri) + timestamp = time.time() file_uri = os.path.join( log_model.uri, @@ -264,6 +267,10 @@ def _write( if last: self._merge(log_model) else: + logs_base_uri = os.path.dirname(log_model.uri) + if not self.artifact_store.exists(logs_base_uri): + self.artifact_store.makedirs(logs_base_uri) + with self.artifact_store.open(log_model.uri, "a") as f: f.write(content) @@ -275,8 +282,6 @@ def _write( except Exception as e: logger.error(f"Failed to write logs to {log_model.uri}: {e}") raise - finally: - self.artifact_store.cleanup() def _merge(self, log_model: "LogsResponse"): """Merges all log files into one in the given URI. diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index f96caf6bad7..e17a1c7e14f 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -29,9 +29,9 @@ from opentelemetry.sdk._logs.export import LogExporter from zenml.artifact_stores import BaseArtifactStore -from zenml.artifacts.utils import _load_artifact_store +from zenml.artifacts.utils import load_artifact_store from zenml.client import Client -from zenml.enums import LoggingLevels +from zenml.enums import LoggingLevels, StackComponentType from zenml.exceptions import DoesNotExistException from zenml.log_stores.artifact.artifact_log_store_flavor import ( ArtifactLogStoreConfig, @@ -67,25 +67,10 @@ def prepare_logs_uri( """ logs_base_uri = os.path.join(artifact_store.path, "logs") - if not artifact_store.exists(logs_base_uri): - artifact_store.makedirs(logs_base_uri) - if artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - logs_uri = os.path.join(logs_base_uri, log_id) - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs directory {logs_uri} already exists! Removing old log directory..." - ) - artifact_store.rmtree(logs_uri) - - artifact_store.makedirs(logs_uri) + logs_uri = os.path.join(logs_base_uri, str(log_id)) else: logs_uri = os.path.join(logs_base_uri, f"{log_id}{LOGS_EXTENSION}") - if artifact_store.exists(logs_uri): - logger.warning( - f"Logs file {logs_uri} already exists! Removing old log file..." - ) - artifact_store.remove(logs_uri) return sanitize_remote_path(logs_uri) @@ -152,7 +137,7 @@ def _stream_logs_line_by_line( Raises: DoesNotExistException: If the artifact does not exist in the artifact store. """ - artifact_store = _load_artifact_store(artifact_store_id, zen_store) + artifact_store = load_artifact_store(artifact_store_id, zen_store) try: if not artifact_store.isdir(logs_uri): @@ -245,6 +230,30 @@ def __init__( super().__init__(*args, **kwargs) self._artifact_store = artifact_store + @classmethod + def from_artifact_store( + cls, artifact_store: "BaseArtifactStore" + ) -> "ArtifactLogStore": + """Creates an artifact log store from an artifact store. + + Args: + artifact_store: The artifact store to create the log store from. + + Returns: + The created artifact log store. 
+ """ + return cls( + artifact_store=artifact_store, + id=artifact_store.id, + name="default", + config=ArtifactLogStoreConfig(), + flavor="artifact", + type=StackComponentType.LOG_STORE, + user=artifact_store.user, + created=artifact_store.created, + updated=artifact_store.updated, + ) + @property def config(self) -> ArtifactLogStoreConfig: """Returns the configuration of the artifact log store. diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index a5a2f86bfbe..40a815a020d 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -22,9 +22,7 @@ from zenml.config.step_configurations import Step from zenml.config.step_run_info import StepRunInfo from zenml.constants import ( - ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, ENV_ZENML_STEP_OPERATOR, - handle_bool_env_var, ) from zenml.enums import ExecutionMode, ExecutionStatus, StepRuntime from zenml.environment import get_run_environment_dict @@ -43,7 +41,6 @@ from zenml.stack import Stack from zenml.utils import env_utils, exception_utils, string_utils from zenml.utils.logging_utils import ( - generate_logs_request, is_step_logging_enabled, setup_step_logging, ) @@ -261,19 +258,6 @@ def launch(self) -> StepRunResponse: publish_utils.step_exception_info.set(None) pipeline_run, run_was_created = self._create_or_reuse_run() - # Enable or disable step logs storage - if handle_bool_env_var(ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, False): - step_logging_enabled = False - else: - step_logging_enabled = orchestrator_utils.is_setting_enabled( - is_enabled_on_step=self._step.config.enable_step_logs, - is_enabled_on_pipeline=self._snapshot.pipeline_configuration.enable_step_logs, - ) - - logs_request = None - if step_logging_enabled: - logs_request = generate_logs_request(source="step") - if run_was_created: pipeline_run_metadata = self._stack.get_pipeline_run_metadata( run_id=pipeline_run.id @@ -297,7 +281,6 @@ def launch(self) -> StepRunResponse: invocation_id=self._invocation_id, dynamic_config=dynamic_config, ) - step_run_request.logs = logs_request try: request_factory.populate_request(request=step_run_request) @@ -327,7 +310,7 @@ def launch(self) -> StepRunResponse: ): logs_context = setup_step_logging( step_run=step_run, - source="step", + source="prepare_step", ) start_time = time.time() diff --git a/src/zenml/stack/stack.py b/src/zenml/stack/stack.py index a7912bc6773..647cacf48b1 100644 --- a/src/zenml/stack/stack.py +++ b/src/zenml/stack/stack.py @@ -532,41 +532,14 @@ def deployer(self) -> Optional["BaseDeployer"]: def log_store(self) -> "BaseLogStore": """The log store of the stack. - If no log store is configured, returns a temporary default - ArtifactLogStore. - Returns: The log store of the stack. 
""" - if self._log_store: - return self._log_store - else: - from uuid import uuid4 + if not self._log_store: + self.validate_log_store() - from zenml.log_stores import ( - ArtifactLogStore, - ArtifactLogStoreConfig, - ArtifactLogStoreFlavor, - ) - - flavor = ArtifactLogStoreFlavor() - now = utc_now() - - self._log_store = ArtifactLogStore( - id=uuid4(), - name="default", - flavor=flavor.name, - type=flavor.type, - config=ArtifactLogStoreConfig(), - environment={}, - user=Client().active_user.id, - created=now, - updated=now, - secrets=[], - # Here, we tie the artifact log store to the artifact store - artifact_store=self.artifact_store, - ) - return self._log_store + assert self._log_store is not None + return self._log_store def dict(self) -> Dict[str, str]: """Converts the stack into a dictionary. @@ -820,6 +793,7 @@ def validate( return self.validate_image_builder() + self.validate_log_store() for component in self.components.values(): if component.validator: component.validator.validate(stack=self) @@ -873,6 +847,17 @@ def validate_image_builder(self) -> None: self._image_builder = image_builder + def validate_log_store(self) -> None: + """Validates that the stack has a log store.""" + from zenml.log_stores import ArtifactLogStore + + if self._log_store: + return + + self._log_store = ArtifactLogStore.from_artifact_store( + self.artifact_store + ) + def prepare_pipeline_submission( self, snapshot: "PipelineSnapshotResponse" ) -> None: diff --git a/src/zenml/stack/stack_component.py b/src/zenml/stack/stack_component.py index 284592c44f4..467afb10dc2 100644 --- a/src/zenml/stack/stack_component.py +++ b/src/zenml/stack/stack_component.py @@ -18,7 +18,16 @@ from collections.abc import Mapping, Sequence from datetime import datetime from inspect import isclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Set, + Type, + Union, +) from uuid import UUID from pydantic import BaseModel, ConfigDict, model_validator diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 4a38c16e714..dd7de007e0b 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -45,7 +45,6 @@ ) from zenml.orchestrators.utils import is_setting_enabled from zenml.stack import StackComponent -from zenml.utils.time_utils import utc_now if TYPE_CHECKING: from zenml.zen_stores.base_zen_store import BaseZenStore @@ -395,6 +394,7 @@ def fetch_logs( DoesNotExistException: If the log store doesn't exist or is not the right type. NotImplementedError: If the log store's dependencies are not installed. """ + from zenml.artifacts.utils import load_artifact_store from zenml.log_stores.base_log_store import BaseLogStore log_store: Optional[BaseLogStore] = None @@ -422,38 +422,15 @@ def fetch_logs( f"Log store '{log_store_model.name}' could not be " "instantiated." ) - else: - from zenml.artifact_stores.base_artifact_store import BaseArtifactStore + elif logs.artifact_store_id: from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, ) - from zenml.log_stores.artifact.artifact_log_store_flavor import ( - ArtifactLogStoreConfig, - ) - current_time = utc_now() - - artifact_store = zen_store.get_stack_component(logs.artifact_store_id) - if not artifact_store.type == StackComponentType.ARTIFACT_STORE: - raise DoesNotExistException( - f"Stack component '{logs.artifact_store_id}' is not an artifact store." 
- ) + artifact_store = load_artifact_store(logs.artifact_store_id, zen_store) + log_store = ArtifactLogStore.from_artifact_store(artifact_store) - artifact_store = cast( - "BaseArtifactStore", - StackComponent.from_model(artifact_store), - ) - log_store = ArtifactLogStore( - name="default_artifact_log_store", - id=uuid4(), - config=ArtifactLogStoreConfig(), - flavor="artifact", - type=StackComponentType.LOG_STORE, - user=uuid4(), - created=current_time, - updated=current_time, - # Here, we tie the artifact log store to the artifact store - artifact_store=artifact_store, - ) + else: + return [] return log_store.fetch(logs_model=logs, limit=limit) diff --git a/src/zenml/zen_server/download_utils.py b/src/zenml/zen_server/download_utils.py index c722e7b87bf..130e98305f1 100644 --- a/src/zenml/zen_server/download_utils.py +++ b/src/zenml/zen_server/download_utils.py @@ -18,7 +18,7 @@ import tempfile from typing import TYPE_CHECKING -from zenml.artifacts.utils import _load_artifact_store +from zenml.artifacts.utils import load_artifact_store from zenml.exceptions import IllegalOperationError from zenml.models import ( ArtifactVersionResponse, @@ -51,7 +51,7 @@ def verify_artifact_is_downloadable( "underlying artifact store was deleted." ) - artifact_store = _load_artifact_store( + artifact_store = load_artifact_store( artifact_store_id=artifact.artifact_store_id, zen_store=zen_store() ) diff --git a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py index 4f7a869a4b5..c8c2a4d69ed 100644 --- a/src/zenml/zen_stores/schemas/pipeline_run_schemas.py +++ b/src/zenml/zen_stores/schemas/pipeline_run_schemas.py @@ -611,7 +611,7 @@ def to_model( client_logs = [ log_entry for log_entry in self.logs - if log_entry.source == "client" + if log_entry.source in ["client", "deployment"] ] if self.snapshot: diff --git a/tests/integration/functional/zen_stores/test_zen_store.py b/tests/integration/functional/zen_stores/test_zen_store.py index c7a5dbb9e59..daa17bd3c36 100644 --- a/tests/integration/functional/zen_stores/test_zen_store.py +++ b/tests/integration/functional/zen_stores/test_zen_store.py @@ -50,7 +50,7 @@ ) from zenml import Model, Tag, add_tags, log_metadata, pipeline, step from zenml.artifacts.utils import ( - _load_artifact_store, + load_artifact_store, ) from zenml.client import Client from zenml.code_repositories import BaseCodeRepository @@ -3280,7 +3280,7 @@ def test_logs_are_recorded_properly_when_disabled(clean_client): artifact_store_id = steps[0].output.artifact_store_id assert artifact_store_id - artifact_store = _load_artifact_store(artifact_store_id, store) + artifact_store = load_artifact_store(artifact_store_id, store) logs_uri_1 = prepare_logs_uri( artifact_store=artifact_store, From 8c249dd79b4a522dd8a7945f72e8306371d83bb7 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 1 Dec 2025 15:33:21 +0100 Subject: [PATCH 49/81] Update logging for deployers to use utils --- src/zenml/deployers/server/service.py | 12 +++++------- src/zenml/utils/logging_utils.py | 12 ++++-------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/zenml/deployers/server/service.py b/src/zenml/deployers/server/service.py index a3dac1d5e70..7d81dd1a8e3 100644 --- a/src/zenml/deployers/server/service.py +++ b/src/zenml/deployers/server/service.py @@ -16,7 +16,6 @@ import time import traceback from abc import ABC, abstractmethod -from contextlib import nullcontext from datetime import datetime, timezone from typing import ( TYPE_CHECKING, 
@@ -64,7 +63,7 @@ from zenml.stack import Stack from zenml.steps.utils import get_unique_step_output_names from zenml.utils import env_utils, source_utils -from zenml.utils.logging_utils import LoggingContext, generate_logs_request +from zenml.utils.logging_utils import setup_run_logging from zenml.zen_stores.rest_zen_store import RestZenStore if TYPE_CHECKING: @@ -521,11 +520,9 @@ def _prepare_execute_with_orchestrator( deployment_snapshot = self._client.zen_store.create_snapshot( deployment_snapshot_request ) - logs_request = generate_logs_request(source="deployment") # Create a placeholder run using the new deployment snapshot placeholder_run = run_utils.create_placeholder_run( snapshot=deployment_snapshot, - logs=logs_request, trigger_info=PipelineRunTriggerInfo( deployment_id=self.deployment.id, ), @@ -580,9 +577,10 @@ def _execute_with_orchestrator( ) captured_outputs: Optional[Dict[str, Dict[str, Any]]] = None - logging_context = nullcontext() - if placeholder_run.logs: - logging_context = LoggingContext(log_model=placeholder_run.logs) + logging_context = setup_run_logging( + pipeline_run=placeholder_run, + source="deployment", + ) with logging_context: try: diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index dd7de007e0b..20e35590f3d 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -314,10 +314,8 @@ def setup_run_logging( Returns: The logs context. """ - if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, source - ): - return LoggingContext(log_model=orchestrator_logs) + if run_logs := search_logs_by_source(pipeline_run.log_collection, source): + return LoggingContext(log_model=run_logs) logs_request = generate_logs_request(source=source) try: @@ -329,10 +327,8 @@ def setup_run_logging( except Exception as e: logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") - if orchestrator_logs := search_logs_by_source( - pipeline_run.log_collection, source - ): - return LoggingContext(log_model=orchestrator_logs) + if run_logs := search_logs_by_source(pipeline_run.log_collection, source): + return LoggingContext(log_model=run_logs) return nullcontext() From a299e4d2034047b9c58839dfd438b78040e4ecc0 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 1 Dec 2025 16:49:12 +0100 Subject: [PATCH 50/81] Refactored the EOF operation --- .../artifact/artifact_log_exporter.py | 70 ++++++++++++------- src/zenml/log_stores/base_log_store.py | 18 +++++ src/zenml/log_stores/otel/otel_log_store.py | 33 +++++++++ src/zenml/utils/logging_utils.py | 14 +--- 4 files changed, 97 insertions(+), 38 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 116d8d8ae3e..c59b73806c6 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -16,7 +16,7 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Sequence +from typing import TYPE_CHECKING, Dict, List, Sequence, Set from uuid import UUID, uuid4 from opentelemetry import context as otel_context @@ -28,8 +28,8 @@ remove_ansi_escape_codes, ) from zenml.log_stores.otel.otel_log_store import ( - ZENML_LOGGING_CONTEXT_EXIT_TOKEN, ZENML_OTEL_LOG_STORE_CONTEXT_KEY, + ZENML_OTEL_LOG_STORE_FLUSH_KEY, ) from zenml.logger import get_logger from zenml.models import LogsResponse @@ -72,15 +72,24 @@ def export(self, batch: Sequence["LogData"]) -> 
LogExportResult: try: entries_by_id: Dict[UUID, List[str]] = defaultdict(list) responses_by_id: Dict[UUID, "LogsResponse"] = {} + finalized_ids: Set[UUID] = set() for log_data in batch: log_model = otel_context.get_value( key=ZENML_OTEL_LOG_STORE_CONTEXT_KEY, context=log_data.log_record.context, ) + flush = otel_context.get_value( + key=ZENML_OTEL_LOG_STORE_FLUSH_KEY, + context=log_data.log_record.context, + ) if not log_model: continue + if flush: + finalized_ids.add(log_model.id) + continue + responses_by_id[log_model.id] = log_model entries = self._otel_record_to_log_entries(log_data.log_record) @@ -91,13 +100,11 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: for log_id, log_lines in entries_by_id.items(): if log_lines: log_model = responses_by_id[log_id] + self._write(log_lines, log_model) - last = False - if ZENML_LOGGING_CONTEXT_EXIT_TOKEN in log_lines: - last = True - log_lines.pop(-1) - - self._write(log_lines, log_model, last=last) + for log_id in finalized_ids: + log_model = responses_by_id[log_id] + self._finalize(log_model) return LogExportResult.SUCCESS @@ -233,14 +240,12 @@ def _write( self, log_lines: List[str], log_model: "LogsResponse", - last: bool = False, ) -> None: """Write log lines to the artifact store. Args: log_lines: List of JSON-serialized log entries. log_model: The log model. - last: Whether this is the last batch of log lines. """ if not log_model.uri or not log_model.artifact_store_id: logger.warning( @@ -263,9 +268,6 @@ def _write( with self.artifact_store.open(file_uri, "w") as f: f.write(content) - - if last: - self._merge(log_model) else: logs_base_uri = os.path.dirname(log_model.uri) if not self.artifact_store.exists(logs_base_uri): @@ -274,15 +276,37 @@ def _write( with self.artifact_store.open(log_model.uri, "a") as f: f.write(content) - if last: - self.artifact_store._remove_previous_file_versions( - log_model.uri - ) - except Exception as e: logger.error(f"Failed to write logs to {log_model.uri}: {e}") raise + def _finalize( + self, + log_model: "LogsResponse", + ) -> None: + """Finalize the logs for a given log model by merging all log files into one. + + Args: + log_model: The log model. + """ + if not log_model.uri or not log_model.artifact_store_id: + logger.warning( + f"Skipping log finalize: missing uri or artifact_store_id for log {log_model.id}" + ) + return + + try: + if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: + self._merge(log_model) + else: + self.artifact_store._remove_previous_file_versions( + log_model.uri + ) + + except Exception as e: + logger.error(f"Failed to finalize logs for {log_model.uri}: {e}") + raise + def _merge(self, log_model: "LogsResponse"): """Merges all log files into one in the given URI. 
@@ -313,7 +337,7 @@ def _merge(self, log_model: "LogsResponse"): merged_file.write( str( _load_file_from_artifact_store( - os.path.join(self.logs_uri, str(file)), + os.path.join(log_model.uri, str(file)), artifact_store=self.artifact_store, mode="r", ) @@ -326,13 +350,9 @@ def _merge(self, log_model: "LogsResponse"): for file in files_: if file not in missing_files: self.artifact_store.remove( - os.path.join(self.logs_uri, str(file)) + os.path.join(log_model.uri, str(file)) ) - # Update the last merge time - self.last_merge_time = time.time() - def shutdown(self) -> None: """Shutdown the exporter.""" - # TODO: Merge - pass + self.artifact_store.cleanup() diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index afc18d56d74..f57a5767287 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -72,6 +72,24 @@ def emit( log_model: The log model to emit the log record to. """ + @abstractmethod + def finalize( + self, + log_model: LogsResponse, + ) -> None: + """Finalize the stream of log records associated with a log model. + + This is used to announce the end of the stream of log records associated + with a log model and that no more log records will be emitted. + + The implementation should ensure that all log records associated with + the log model are flushed to the backend and any resources (clients, + connections, file descriptors, etc.) are released. + + Args: + log_model: The log model to finalize. + """ + def register_emitter(self) -> None: """Register an emitter for the log store.""" with self._lock: diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 01ad68c621b..d61751a719f 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -42,6 +42,9 @@ ZENML_OTEL_LOG_STORE_CONTEXT_KEY = otel_context.create_key( "zenml.logging_context" ) +ZENML_OTEL_LOG_STORE_FLUSH_KEY = otel_context.create_key( + "zenml.log_store_flush" +) class OtelLogStore(BaseLogStore): @@ -154,6 +157,36 @@ def emit( context=ctx, ) + def finalize( + self, + log_model: LogsResponse, + ) -> None: + """Finalize the stream of log records associated with a log model. + + Args: + log_model: The log model to finalize. + """ + with self._lock: + if not self._provider: + return + + # Attach the log_model to OTel's context so the exporter + # can access it in the background processor thread + ctx = otel_context.set_value( + ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model + ) + ctx = otel_context.set_value( + ZENML_OTEL_LOG_STORE_FLUSH_KEY, True, context=ctx + ) + + otel_logger = self._provider.get_logger( + "zenml.log_store.flush", + schema_url=None, + ) + otel_logger.emit( + context=ctx, + ) + def flush(self) -> None: """Flush the log store. diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 20e35590f3d..da6d4f96bc2 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -56,8 +56,6 @@ "active_logging_context", default=None ) -ZENML_LOGGING_CONTEXT_EXIT_TOKEN = "__ZENML_LOGGING_CONTEXT_EXIT__" - class LogEntry(BaseModel): """A structured log entry with parsed information. 
@@ -186,17 +184,7 @@ def __exit__( ) ) - LoggingContext.emit( - logging.LogRecord( - name="", - level=logging.INFO, - msg=ZENML_LOGGING_CONTEXT_EXIT_TOKEN, - args=(), - exc_info=None, - pathname="", - lineno=0, - ) - ) + self._log_store.finalize(self.log_model) with self._lock: active_logging_context.set(self._previous_context) From 0e10659a168a75fbb7dd6d0a572d5a381e557c32 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Mon, 1 Dec 2025 17:50:49 +0100 Subject: [PATCH 51/81] Fix first round of bugs after last changes --- .../artifact/artifact_log_exporter.py | 9 ++--- src/zenml/utils/logging_utils.py | 5 ++- src/zenml/zen_stores/sql_zen_store.py | 37 +++++++++++++++++++ 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index c59b73806c6..c5d63469512 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -16,7 +16,7 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Sequence, Set +from typing import TYPE_CHECKING, Dict, List, Sequence from uuid import UUID, uuid4 from opentelemetry import context as otel_context @@ -72,7 +72,7 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: try: entries_by_id: Dict[UUID, List[str]] = defaultdict(list) responses_by_id: Dict[UUID, "LogsResponse"] = {} - finalized_ids: Set[UUID] = set() + finalized_log_streams: List["LogsResponse"] = [] for log_data in batch: log_model = otel_context.get_value( @@ -87,7 +87,7 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: continue if flush: - finalized_ids.add(log_model.id) + finalized_log_streams.append(log_model) continue responses_by_id[log_model.id] = log_model @@ -102,8 +102,7 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: log_model = responses_by_id[log_id] self._write(log_lines, log_model) - for log_id in finalized_ids: - log_model = responses_by_id[log_id] + for log_model in finalized_log_streams: self._finalize(log_model) return LogExportResult.SUCCESS diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index da6d4f96bc2..778218880fd 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -43,8 +43,6 @@ StepRunResponse, StepRunUpdate, ) -from zenml.orchestrators.utils import is_setting_enabled -from zenml.stack import StackComponent if TYPE_CHECKING: from zenml.zen_stores.base_zen_store import BaseZenStore @@ -260,6 +258,8 @@ def is_step_logging_enabled( Returns: True if logging is enabled, False if disabled. 
""" + from zenml.orchestrators.utils import is_setting_enabled + if handle_bool_env_var(ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, False): return False else: @@ -380,6 +380,7 @@ def fetch_logs( """ from zenml.artifacts.utils import load_artifact_store from zenml.log_stores.base_log_store import BaseLogStore + from zenml.stack import StackComponent log_store: Optional[BaseLogStore] = None diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index d245fa8254e..4d9ffa4ee93 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -10514,6 +10514,43 @@ def update_run_step( session=session, ) + # Add logs if specified + if step_run_update.add_logs: + try: + for log_request in step_run_update.add_logs: + # Validate the artifact store exists + self._get_reference_schema_by_id( + resource=log_request, + reference_schema=StackComponentSchema, + reference_id=log_request.artifact_store_id, + session=session, + reference_type="logs artifact store", + ) + + # Create the log entry + log_entry = LogsSchema( + id=log_request.id, + uri=log_request.uri, + # TODO: Remove fallback when not supporting + # clients <0.84.0 anymore + source=log_request.source or "execution", + step_run_id=existing_step_run.id, + artifact_store_id=log_request.artifact_store_id, + log_store_id=log_request.log_store_id, + ) + session.add(log_entry) + + session.commit() + except IntegrityError: + session.rollback() + raise EntityExistsError( + "Unable to create log entry: One of the provided sources " + f"({', '.join(log.source for log in step_run_update.add_logs)}) " + "already exists within the scope of the same step " + f"'{step_run_id}'. Existing entry sources: " + f"{', '.join(log.source for log in existing_step_run.logs)}" + ) + return existing_step_run.to_model( include_metadata=True, include_resources=True ) From 49d8eac7d44c69f7bc573b01f2cb8b05940589ec Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 10:21:48 +0100 Subject: [PATCH 52/81] formatting --- src/zenml/execution/pipeline/dynamic/runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index b7d4ad166df..0ea8e46e422 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -57,7 +57,6 @@ from zenml.models import ( ArtifactVersionResponse, PipelineRunResponse, - PipelineRunUpdate, PipelineSnapshotResponse, ) from zenml.orchestrators.publish_utils import ( From 7aab5d5d354b13f17f030509c415f8aae80a9f38 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 11:19:27 +0100 Subject: [PATCH 53/81] formatting, linting, docstrings and tests --- .../execution/pipeline/dynamic/runner.py | 1 - .../artifact/artifact_log_exporter.py | 35 ++++++++---- .../log_stores/artifact/artifact_log_store.py | 1 - src/zenml/log_stores/base_log_store.py | 1 - .../datadog/datadog_log_exporter.py | 8 +-- .../log_stores/datadog/datadog_log_store.py | 1 - src/zenml/log_stores/otel/otel_log_store.py | 23 ++++---- src/zenml/logger.py | 23 ++++++-- src/zenml/utils/logging_utils.py | 25 +++++---- .../versions/5c0a1c787128_add_log_stores.py | 2 +- src/zenml/zen_stores/sql_zen_store.py | 1 + .../functional/zen_stores/test_zen_store.py | 54 +++---------------- 12 files changed, 89 insertions(+), 86 deletions(-) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 
0ea8e46e422..884e9c15652 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -168,7 +168,6 @@ def run_pipeline(self) -> None: ): logging_context = setup_run_logging( pipeline_run=run, - snapshot=self._snapshot, source="orchestrator", ) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index c5d63469512..1303be2e90c 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -16,7 +16,7 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Sequence +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, cast from uuid import UUID, uuid4 from opentelemetry import context as otel_context @@ -75,9 +75,12 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: finalized_log_streams: List["LogsResponse"] = [] for log_data in batch: - log_model = otel_context.get_value( - key=ZENML_OTEL_LOG_STORE_CONTEXT_KEY, - context=log_data.log_record.context, + log_model = cast( + Optional["LogsResponse"], + otel_context.get_value( + key=ZENML_OTEL_LOG_STORE_CONTEXT_KEY, + context=log_data.log_record.context, + ), ) flush = otel_context.get_value( key=ZENML_OTEL_LOG_STORE_FLUSH_KEY, @@ -92,7 +95,7 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: responses_by_id[log_model.id] = log_model - entries = self._otel_record_to_log_entries(log_data.log_record) + entries = self._otel_record_to_log_entries(log_data) for entry in entries: json_line = entry.model_dump_json(exclude_none=True) entries_by_id[log_model.id].append(json_line) @@ -112,20 +115,25 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.FAILURE def _otel_record_to_log_entries( - self, log_record: "LogData" + self, log_data: "LogData" ) -> List[LogEntry]: """Convert an OTEL log record to ZenML LogEntry objects. Args: - log_record: The OpenTelemetry log record. + log_data: The OpenTelemetry log data. Returns: List of LogEntry objects (multiple if message was chunked). """ + log_record = log_data.log_record message = str(log_record.body) if log_record.body else "" message = remove_ansi_escape_codes(message).rstrip() - level = self._map_severity_to_level(log_record.severity_text) + level = ( + self._map_severity_to_level(log_record.severity_text) + if log_record.severity_text + else None + ) name = "unknown" module = None @@ -245,6 +253,9 @@ def _write( Args: log_lines: List of JSON-serialized log entries. log_model: The log model. + + Raises: + Exception: If the log lines cannot be written to the artifact store. """ if not log_model.uri or not log_model.artifact_store_id: logger.warning( @@ -287,6 +298,9 @@ def _finalize( Args: log_model: The log model. + + Raises: + Exception: If the logs cannot be finalized. """ if not log_model.uri or not log_model.artifact_store_id: logger.warning( @@ -306,7 +320,7 @@ def _finalize( logger.error(f"Failed to finalize logs for {log_model.uri}: {e}") raise - def _merge(self, log_model: "LogsResponse"): + def _merge(self, log_model: "LogsResponse") -> None: """Merges all log files into one in the given URI. Called on the logging context exit. 
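# The two storage layouts handled by _write() above, sketched under the
# assumption that immutable artifact stores get a fresh, timestamp/UUID-named
# file per exported batch (the exact filename pattern lives outside this hunk),
# while mutable stores append to a single file. _finalize() later merges the
# per-batch files (immutable stores) or removes stale file versions (mutable
# stores).
#
#     if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM:
#         file_uri = os.path.join(log_model.uri, f"{time.time()}_{uuid4()}{LOGS_EXTENSION}")
#         with self.artifact_store.open(file_uri, "w") as f:
#             f.write(content)
#     else:
#         with self.artifact_store.open(log_model.uri, "a") as f:
#             f.write(content)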
@@ -319,6 +333,9 @@ def _merge(self, log_model: "LogsResponse"): from zenml.artifacts.utils import _load_file_from_artifact_store from zenml.exceptions import DoesNotExistException + if not log_model.uri: + raise ValueError("Log model has no URI, cannot merge logs.") + files_ = self.artifact_store.listdir(log_model.uri) if len(files_) > 1: files_.sort() diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index e17a1c7e14f..649b1e15e22 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -289,7 +289,6 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. - message_size: Maximum size of a single log message in bytes. Returns: List of log entries from the artifact store. diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index f57a5767287..8cfb710c3b0 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -128,7 +128,6 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. - message_size: Maximum size of a single log message in bytes. Returns: List of log entries matching the query. diff --git a/src/zenml/log_stores/datadog/datadog_log_exporter.py b/src/zenml/log_stores/datadog/datadog_log_exporter.py index 81b440b488c..21307578e63 100644 --- a/src/zenml/log_stores/datadog/datadog_log_exporter.py +++ b/src/zenml/log_stores/datadog/datadog_log_exporter.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. """OpenTelemetry exporter that sends logs to Datadog.""" -from typing import Any, List +from typing import Sequence import requests from opentelemetry.sdk._logs import LogData @@ -48,7 +48,7 @@ def __init__( "Content-Type": "application/json", } - def export(self, batch: List[LogData]) -> Any: + def export(self, batch: Sequence["LogData"]) -> LogExportResult: """Export a batch of log records to Datadog. Args: @@ -79,7 +79,9 @@ def export(self, batch: List[LogData]) -> Any: log_entry["status"] = log_record.severity_text.lower() if log_record.timestamp: - log_entry["timestamp"] = int(log_record.timestamp / 1_000_000) + log_entry["timestamp"] = str( + int(log_record.timestamp / 1_000_000) + ) if all_attrs: tags = [f"{k}:{v}" for k, v in all_attrs.items()] diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index f41a0362d83..56bcaee0379 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -79,7 +79,6 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. - message_size: Maximum size of a single log message in bytes. Returns: List of log entries from Datadog. diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index d61751a719f..38bdbc68f75 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -113,11 +113,17 @@ def emit( Args: record: The log record to process. log_model: The log model to emit the log record to. + + Raises: + RuntimeError: If the OpenTelemetry provider is not initialized. 
""" with self._lock: if not self._provider: self.activate() + if self._provider is None: + raise RuntimeError("OpenTelemetry provider is not initialized") + # Attach the log_model to OTel's context so the exporter # can access it in the background processor thread ctx = otel_context.set_value( @@ -195,7 +201,7 @@ def flush(self) -> None: if self._processor: self._processor.force_flush() - def _get_severity_number(self, levelno: int) -> int: + def _get_severity_number(self, levelno: int) -> SeverityNumber: """Map Python log level to OTEL severity number. Args: @@ -205,17 +211,17 @@ def _get_severity_number(self, levelno: int) -> int: OTEL severity number. """ if levelno >= logging.CRITICAL: - return SeverityNumber.FATAL.value + return SeverityNumber.FATAL elif levelno >= logging.ERROR: - return SeverityNumber.ERROR.value + return SeverityNumber.ERROR elif levelno >= logging.WARNING: - return SeverityNumber.WARN.value + return SeverityNumber.WARN elif levelno >= logging.INFO: - return SeverityNumber.INFO.value + return SeverityNumber.INFO elif levelno >= logging.DEBUG: - return SeverityNumber.DEBUG.value + return SeverityNumber.DEBUG else: - return SeverityNumber.UNSPECIFIED.value + return SeverityNumber.UNSPECIFIED def deactivate(self) -> None: """Deactivate log collection and shut down the processor. @@ -231,7 +237,7 @@ def deactivate(self) -> None: logger.warning(f"Error flushing logs: {e}") try: - self._processor.shutdown() + self._processor.shutdown() # type: ignore[no-untyped-call] logger.debug("Shut down log processor and background thread") except Exception as e: logger.warning(f"Error shutting down processor: {e}") @@ -257,7 +263,6 @@ def fetch( start_time: Filter logs after this time. end_time: Filter logs before this time. limit: Maximum number of log entries to return. - message_size: Maximum size of a single log message in bytes. Returns: List of log entries from the backend. diff --git a/src/zenml/logger.py b/src/zenml/logger.py index ff950737b84..bf386495cd4 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -52,7 +52,7 @@ class _ZenMLStdoutStream: preventing duplicate log entries in stored logs. """ - def write(self, text: str) -> int: + def write(self, text: str) -> Any: """Write text to the original stdout. Args: @@ -246,10 +246,25 @@ def set_root_verbosity() -> None: def _wrapped_write(original_write: Any, stream_name: str) -> Any: - """Wrap stdout/stderr write method to route logs to LoggingContext.""" + """Wrap stdout/stderr write method to route logs to LoggingContext. - def wrapped_write(text: str) -> int: - """Write method that routes logs through LoggingContext.""" + Args: + original_write: The original write method. + stream_name: The name of the stream. + + Returns: + The wrapped write method. + """ + + def wrapped_write(text: str) -> Any: + """Write method that routes logs through LoggingContext. + + Args: + text: The text to write. + + Returns: + The result of the original write method. + """ from zenml.utils.logging_utils import LoggingContext level_int = logging.INFO if stream_name == "stdout" else logging.ERROR diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 778218880fd..4409aa3eab3 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -296,14 +296,16 @@ def setup_run_logging( Args: pipeline_run: The pipeline run. - snapshot: The snapshot of the pipeline run. source: The source of the logs. Returns: The logs context. 
""" - if run_logs := search_logs_by_source(pipeline_run.log_collection, source): - return LoggingContext(log_model=run_logs) + if pipeline_run.log_collection is not None: + if run_logs := search_logs_by_source( + pipeline_run.log_collection, source + ): + return LoggingContext(log_model=run_logs) logs_request = generate_logs_request(source=source) try: @@ -315,8 +317,11 @@ def setup_run_logging( except Exception as e: logger.error(f"Failed to add logs to the run {pipeline_run.id}: {e}") - if run_logs := search_logs_by_source(pipeline_run.log_collection, source): - return LoggingContext(log_model=run_logs) + if pipeline_run.log_collection is not None: + if run_logs := search_logs_by_source( + pipeline_run.log_collection, source + ): + return LoggingContext(log_model=run_logs) return nullcontext() @@ -336,8 +341,9 @@ def setup_step_logging( Returns: The logs context. """ - if step_logs := search_logs_by_source(step_run.log_collection, source): - return LoggingContext(log_model=step_logs) + if step_run.log_collection is not None: + if step_logs := search_logs_by_source(step_run.log_collection, source): + return LoggingContext(log_model=step_logs) logs_request = generate_logs_request(source=source) try: @@ -349,8 +355,9 @@ def setup_step_logging( except Exception as e: logger.error(f"Failed to add logs to the step run {step_run.id}: {e}") - if step_logs := search_logs_by_source(step_run.log_collection, source): - return LoggingContext(log_model=step_logs) + if step_run.log_collection is not None: + if step_logs := search_logs_by_source(step_run.log_collection, source): + return LoggingContext(log_model=step_logs) return nullcontext() diff --git a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py index e8bdc430f97..540c873849a 100644 --- a/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py +++ b/src/zenml/zen_stores/migrations/versions/5c0a1c787128_add_log_stores.py @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "5c0a1c787128" -down_revision = "d203788f82b9" +down_revision = "0.92.0" branch_labels = None depends_on = None diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index 4d9ffa4ee93..91b26c754ec 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -10436,6 +10436,7 @@ def update_run_step( step_run_update: The update to be applied to the step. Raises: + EntityExistsError: If the log entry already exists. ValueError: If trying to update the step status to retried. 
Returns: diff --git a/tests/integration/functional/zen_stores/test_zen_store.py b/tests/integration/functional/zen_stores/test_zen_store.py index daa17bd3c36..70dc8a58735 100644 --- a/tests/integration/functional/zen_stores/test_zen_store.py +++ b/tests/integration/functional/zen_stores/test_zen_store.py @@ -49,9 +49,6 @@ StubLocalRepositoryContext, ) from zenml import Model, Tag, add_tags, log_metadata, pipeline, step -from zenml.artifacts.utils import ( - load_artifact_store, -) from zenml.client import Client from zenml.code_repositories import BaseCodeRepository from zenml.config.pipeline_configurations import PipelineConfiguration @@ -141,6 +138,7 @@ ) from zenml.utils import code_repository_utils, source_utils from zenml.utils.enum_utils import StrEnum +from zenml.utils.logging_utils import fetch_logs from zenml.zen_stores.rest_zen_store import RestZenStore from zenml.zen_stores.sql_zen_store import SqlZenStore @@ -3247,61 +3245,23 @@ def test_logs_are_recorded_properly(clean_client): with run_context: steps = run_context.steps step1_logs = steps[0].logs - step2_logs = steps[1].logs - step1_logs_content = fetch_log_records( - store, step1_logs.artifact_store_id, step1_logs.uri - ) - step2_logs_content = fetch_log_records( - store, step1_logs.artifact_store_id, step2_logs.uri - ) + step1_logs_content = fetch_logs(step1_logs, store, limit=100) # Step 1 has the word log! Defined in PipelineRunContext assert any("log" in record.message for record in step1_logs_content) - # Step 2 does not have logs! - assert any( - "Step `int_plus_one_test_step` has started." in record.message - for record in step2_logs_content - ) - -def test_logs_are_recorded_properly_when_disabled(clean_client): - """Tests no logs are stored in the artifact store when disabled""" +def test_logs_dont_exist_when_disabled(clean_client): + """Tests that logs don't exist when disabled.""" client = Client() store = client.zen_store - with PipelineRunContext(2, enable_step_logs=False): + with PipelineRunContext(num_runs=2, enable_step_logs=False): steps = store.list_run_steps(StepRunFilter()) step1_logs = steps[0].logs step2_logs = steps[1].logs - assert not step1_logs - assert not step2_logs - - artifact_store_id = steps[0].output.artifact_store_id - assert artifact_store_id - - artifact_store = load_artifact_store(artifact_store_id, store) - - logs_uri_1 = prepare_logs_uri( - artifact_store=artifact_store, - step_name=steps[0].name, - ) - - logs_uri_2 = prepare_logs_uri( - artifact_store=artifact_store, - step_name=steps[1].name, - ) - - prepare_logs_uri( - artifact_store=artifact_store, - step_name=steps[1].name, - ) - - with pytest.raises(FileNotFoundError): - fetch_log_records(store, artifact_store_id, logs_uri_1) - - with pytest.raises(FileNotFoundError): - fetch_log_records(store, artifact_store_id, logs_uri_2) + assert step1_logs is None + assert step2_logs is None # .--------------------. 
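At this point in the series, step and run logs can be read back either through
the logging_utils.fetch_logs helper (as the updated test above does) or
directly through the active stack's log store. A minimal client-side sketch of
the direct path, assuming a finished run and a step named "trainer" (the run
lookup, the step name and the explicit cleanup() call are illustrative, not
taken from the patch):

    from zenml.client import Client

    client = Client()
    run = client.get_pipeline_run("<run-name-or-id>")
    step_run = run.steps["trainer"]
    if step_run.logs is not None:
        log_store = client.active_stack.log_store
        try:
            for entry in log_store.fetch(step_run.logs, limit=100):
                print(entry.message)
        finally:
            log_store.cleanup()

A later patch in this series (PATCH 59/81) restricts fetch_logs to the server
environment, so the direct log store call sketched here is the pattern that
remains available on the client.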
From c2ca75265f9f27cd18ed000dc3326c2000ae7a58 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 14:03:23 +0100 Subject: [PATCH 54/81] unit tests --- tests/unit/deployers/server/test_service_outputs.py | 1 + tests/unit/stack/conftest.py | 8 ++++++++ tests/unit/stack/test_stack.py | 10 ++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/unit/deployers/server/test_service_outputs.py b/tests/unit/deployers/server/test_service_outputs.py index 53c9b228b46..64c90ac580e 100644 --- a/tests/unit/deployers/server/test_service_outputs.py +++ b/tests/unit/deployers/server/test_service_outputs.py @@ -77,6 +77,7 @@ class _DummyRun: def __init__(self) -> None: self.id = uuid4() self.name = "test-run" + self.log_collection = None class _DummyDeploymentAppRunnerFlavor(BaseDeploymentAppRunnerFlavor): diff --git a/tests/unit/stack/conftest.py b/tests/unit/stack/conftest.py index b7fa3a8b4d2..34e83a60371 100644 --- a/tests/unit/stack/conftest.py +++ b/tests/unit/stack/conftest.py @@ -37,12 +37,20 @@ def stack_with_mock_components(mocker): spec=BaseOrchestrator, type=StackComponentType.ORCHESTRATOR, flavor=MOCK_FLAVOR, + id=uuid4(), + user=uuid4(), + created=datetime.now(), + updated=datetime.now(), ) artifact_store = mocker.Mock( spec=BaseArtifactStore, type=StackComponentType.ARTIFACT_STORE, flavor=MOCK_FLAVOR, path="/", + id=uuid4(), + user=uuid4(), + created=datetime.now(), + updated=datetime.now(), ) orchestrator.config.required_secrets = set() orchestrator.settings_class = None diff --git a/tests/unit/stack/test_stack.py b/tests/unit/stack/test_stack.py index 25cadd47af0..267284964bf 100644 --- a/tests/unit/stack/test_stack.py +++ b/tests/unit/stack/test_stack.py @@ -85,7 +85,10 @@ def test_stack_returns_all_its_components( StackComponentType.ORCHESTRATOR: local_orchestrator, StackComponentType.ARTIFACT_STORE: local_artifact_store, } - assert stack.components == expected_components + assert all( + stack.components[component_type] == component + for component_type, component in expected_components.items() + ) # check that it also works with optional container registry stack = Stack( @@ -100,7 +103,10 @@ def test_stack_returns_all_its_components( local_container_registry ) - assert stack.components == expected_components + assert all( + stack.components[component_type] == component + for component_type, component in expected_components.items() + ) def test_stack_requirements(stack_with_mock_components): From a3829b5e5168a461cef7ab46b7fd129324a61f63 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 14:04:52 +0100 Subject: [PATCH 55/81] docstrings --- src/zenml/log_stores/artifact/artifact_log_exporter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 1303be2e90c..94475d34c2b 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -327,6 +327,9 @@ def _merge(self, log_model: "LogsResponse") -> None: Args: log_model: The log model. + + Raises: + RuntimeError: If the log model has no URI, cannot merge logs. 
""" # If the artifact store is immutable, merge the log files if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: @@ -334,7 +337,7 @@ def _merge(self, log_model: "LogsResponse") -> None: from zenml.exceptions import DoesNotExistException if not log_model.uri: - raise ValueError("Log model has no URI, cannot merge logs.") + raise RuntimeError("Log model has no URI, cannot merge logs.") files_ = self.artifact_store.listdir(log_model.uri) if len(files_) > 1: From b24b7bf1dd231109ad431392fcc348fe44a00a36 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 14:08:09 +0100 Subject: [PATCH 56/81] removed old tests --- .../functional/steps/test_logging.py | 108 ------------------ 1 file changed, 108 deletions(-) delete mode 100644 tests/integration/functional/steps/test_logging.py diff --git a/tests/integration/functional/steps/test_logging.py b/tests/integration/functional/steps/test_logging.py deleted file mode 100644 index 97de449e7d9..00000000000 --- a/tests/integration/functional/steps/test_logging.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import time -from unittest.mock import patch - -from zenml import pipeline, step -from zenml.artifacts.utils import _load_file_from_artifact_store -from zenml.client import Client -from zenml.logger import get_logger - -logger = get_logger(__name__) - -_LOGS_STORAGE_MAX_QUEUE_SIZE = 5 - - -@step(enable_cache=False) -def steps_writing_above_the_count_limit(multi=2): - """A step that writes logs above the count limit.""" - for i in range(_LOGS_STORAGE_MAX_QUEUE_SIZE * multi): - logger.info(f"step 1 - {i}") - time.sleep(0.01) - - -@step(enable_cache=False) -def step_writing_above_the_time_limit(): - """A step that writes logs above the time limit.""" - for i in range(_LOGS_STORAGE_MAX_QUEUE_SIZE): - logger.info(f"step 1 - {i}") - time.sleep(0.01) - - -@patch( - "zenml.artifact_stores.base_artifact_store.BaseArtifactStoreConfig.IS_IMMUTABLE_FILESYSTEM", - True, -) -def test_that_write_buffer_called_multiple_times_on_exceeding_limits(): - """Test that the write buffer is called multiple times on exceeding limits.""" - - @pipeline - def _inner_1(): - steps_writing_above_the_count_limit() - - @pipeline - def _inner_2(): - step_writing_above_the_time_limit() - - with patch( - "zenml.logging.step_logging.LOGS_STORAGE_MAX_QUEUE_SIZE", - _LOGS_STORAGE_MAX_QUEUE_SIZE, - ): - with patch( - "zenml.logging.step_logging.PipelineLogsStorage.write_buffer" - ) as mock_write_buffer: - run_1 = _inner_1() - assert mock_write_buffer.call_count > 1 - - Client().delete_pipeline(run_1.pipeline.id) - - with patch( - "zenml.logging.step_logging.PipelineLogsStorage.write_buffer" - ) as mock_write_buffer: - with patch( - "zenml.logging.step_logging.LOGS_WRITE_INTERVAL_SECONDS", - 0.001, - ): - run_2 = _inner_2() - assert mock_write_buffer.call_count > 1 - Client().delete_pipeline(run_2.pipeline.id) - - -@patch( - "zenml.artifact_stores.base_artifact_store.BaseArtifactStoreConfig.IS_IMMUTABLE_FILESYSTEM", - True, -) -def test_that_small_files_are_merged_together(): - """Test that small files are merged together.""" - - @pipeline - def _inner_1(): - steps_writing_above_the_count_limit(multi=10) - - with patch( - "zenml.logging.step_logging.LOGS_STORAGE_MAX_QUEUE_SIZE", - _LOGS_STORAGE_MAX_QUEUE_SIZE, - ): - ret = _inner_1() # this run will produce 2+ logs files as it go, proven by previous test - - artifact_store = Client().active_stack.artifact_store - files = artifact_store.listdir( - ret.steps["steps_writing_above_the_count_limit"].logs.uri - ) - assert 
len(files) == 1 - content = str( - _load_file_from_artifact_store( - os.path.join( - ret.steps["steps_writing_above_the_count_limit"].logs.uri, - files[0], - ), - artifact_store, - mode="r", - ) - ).split("\n") - - content_pointer = 0 - for i in range(_LOGS_STORAGE_MAX_QUEUE_SIZE * 10): - while f"step 1 - {i}" not in content[content_pointer]: - content_pointer += 1 - - Client().delete_pipeline(ret.pipeline.id) From 635e3b38927c26d84b44331985c893143e886c3b Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Tue, 2 Dec 2025 14:25:03 +0100 Subject: [PATCH 57/81] format --- .github/workflows/require-release-label.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/require-release-label.yml b/.github/workflows/require-release-label.yml index 66be8d3bd5d..9c6762789a9 100644 --- a/.github/workflows/require-release-label.yml +++ b/.github/workflows/require-release-label.yml @@ -1,12 +1,11 @@ +--- # Requires PRs to have either 'release-notes' or 'no-release-notes' label # This ensures release notes are considered for every PR before merging. # The check is enforced via branch protection rules on develop. name: Require Release Label - on: pull_request: types: [opened, labeled, unlabeled, synchronize] - jobs: check-label: if: github.repository == 'zenml-io/zenml' @@ -17,8 +16,8 @@ jobs: with: mode: exactly count: 1 - labels: "release-notes, no-release-notes" - message: | + labels: release-notes, no-release-notes + message: |- This PR is missing a release label. Please add one of: - `release-notes` - if this PR has user-facing changes that should appear in the changelog - `no-release-notes` - if this is an internal change (refactoring, tests, CI, etc.) From b8ab51ff756dac3950e3ee8525c6c1442da25abb Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Tue, 2 Dec 2025 15:36:53 +0100 Subject: [PATCH 58/81] Apply code review suggestions --- .../kubernetes_orchestrator_entrypoint.py | 8 ++- .../log_stores/artifact/artifact_log_store.py | 64 ++++++++----------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py index 1e106d21301..bb1747e552a 100644 --- a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +++ b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py @@ -59,6 +59,7 @@ from zenml.logger import get_logger from zenml.models import ( PipelineRunResponse, + PipelineRunUpdate, PipelineSnapshotResponse, RunMetadataResource, ) @@ -263,7 +264,12 @@ def main() -> None: orchestrator_run_id = orchestrator_pod_name if args.run_id: - pipeline_run = client.get_pipeline_run(args.run_id) + pipeline_run = client.zen_store.update_run( + run_id=args.run_id, + run_update=PipelineRunUpdate( + orchestrator_run_id=orchestrator_run_id + ), + ) else: pipeline_run = create_placeholder_run( snapshot=snapshot, diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 649b1e15e22..becf61ed257 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -88,15 +88,14 @@ def remove_ansi_escape_codes(text: str) -> str: def fetch_log_records( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], + artifact_store: "BaseArtifactStore", logs_uri: str, ) -> List[LogEntry]: """Fetches log entries. 
Args: zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. + artifact_store: The artifact store. logs_uri: The URI of the artifact (file or directory). Returns: @@ -104,9 +103,7 @@ def fetch_log_records( """ log_entries = [] - for line in _stream_logs_line_by_line( - zen_store, artifact_store_id, logs_uri - ): + for line in _stream_logs_line_by_line(artifact_store, logs_uri): if log_entry := parse_log_entry(line): log_entries.append(log_entry) @@ -117,8 +114,7 @@ def fetch_log_records( def _stream_logs_line_by_line( - zen_store: "BaseZenStore", - artifact_store_id: Union[str, UUID], + artifact_store: "BaseArtifactStore", logs_uri: str, ) -> Iterator[str]: """Stream logs line by line without loading the entire file into memory. @@ -127,8 +123,7 @@ def _stream_logs_line_by_line( and directories with multiple log files. Args: - zen_store: The store in which the artifact is stored. - artifact_store_id: The ID of the artifact store. + artifact_store: The artifact store. logs_uri: The URI of the log file or directory. Yields: @@ -137,33 +132,28 @@ def _stream_logs_line_by_line( Raises: DoesNotExistException: If the artifact does not exist in the artifact store. """ - artifact_store = load_artifact_store(artifact_store_id, zen_store) + if not artifact_store.isdir(logs_uri): + # Single file case + with artifact_store.open(logs_uri, "r") as file: + for line in file: + yield line.rstrip("\n\r") + else: + # Directory case - may contain multiple log files + files = artifact_store.listdir(logs_uri) + if not files: + raise DoesNotExistException( + f"Folder '{logs_uri}' is empty in artifact store " + f"'{artifact_store.name}'." + ) + + # Sort files to read them in order + files.sort() - try: - if not artifact_store.isdir(logs_uri): - # Single file case - with artifact_store.open(logs_uri, "r") as file: - for line in file: + for file in files: + file_path = os.path.join(logs_uri, str(file)) + with artifact_store.open(file_path, "r") as f: + for line in f: yield line.rstrip("\n\r") - else: - # Directory case - may contain multiple log files - files = artifact_store.listdir(logs_uri) - if not files: - raise DoesNotExistException( - f"Folder '{logs_uri}' is empty in artifact store " - f"'{artifact_store.name}'." - ) - - # Sort files to read them in order - files.sort() - - for file in files: - file_path = os.path.join(logs_uri, str(file)) - with artifact_store.open(file_path, "r") as f: - for line in f: - yield line.rstrip("\n\r") - finally: - artifact_store.cleanup() def parse_log_entry(log_line: str) -> Optional[LogEntry]: @@ -319,10 +309,8 @@ def fetch( "ArtifactLogStore.fetch(). Both parameters will be ignored." 
) - client = Client() log_entries = fetch_log_records( - zen_store=client.zen_store, - artifact_store_id=logs_model.artifact_store_id, + artifact_store=self._artifact_store, logs_uri=logs_model.uri, ) From b86f42507748c9ef0ef6b2cc99c5c82ce7a32e3a Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Tue, 2 Dec 2025 16:38:57 +0100 Subject: [PATCH 59/81] Another round of code review suggestions --- src/zenml/log_stores/__init__.py | 8 -- .../log_stores/artifact/artifact_log_store.py | 19 +++-- .../artifact/artifact_log_store_flavor.py | 82 ------------------- src/zenml/log_stores/otel/otel_flavor.py | 76 +---------------- src/zenml/stack/flavor_registry.py | 2 - src/zenml/utils/logging_utils.py | 23 +++++- .../zen_stores/schemas/step_run_schemas.py | 16 ++-- src/zenml/zen_stores/sql_zen_store.py | 4 +- .../functional/zen_stores/test_zen_store.py | 6 +- 9 files changed, 47 insertions(+), 189 deletions(-) delete mode 100644 src/zenml/log_stores/artifact/artifact_log_store_flavor.py diff --git a/src/zenml/log_stores/__init__.py b/src/zenml/log_stores/__init__.py index 024eae29d88..a9d96e8c6a4 100644 --- a/src/zenml/log_stores/__init__.py +++ b/src/zenml/log_stores/__init__.py @@ -23,7 +23,6 @@ # OpenTelemetry log store from zenml.log_stores.otel.otel_flavor import ( OtelLogStoreConfig, - OtelLogStoreFlavor, ) from zenml.log_stores.otel.otel_log_store import OtelLogStore @@ -31,10 +30,6 @@ from zenml.log_stores.artifact.artifact_log_store import ( ArtifactLogStore, ) -from zenml.log_stores.artifact.artifact_log_store_flavor import ( - ArtifactLogStoreConfig, - ArtifactLogStoreFlavor, -) # Datadog log store from zenml.log_stores.datadog.datadog_flavor import ( @@ -47,8 +42,6 @@ __all__ = [ "ArtifactLogStore", - "ArtifactLogStoreConfig", - "ArtifactLogStoreFlavor", "BaseLogStore", "BaseLogStoreConfig", "BaseLogStoreFlavor", @@ -57,5 +50,4 @@ "DatadogLogStoreFlavor", "OtelLogStore", "OtelLogStoreConfig", - "OtelLogStoreFlavor", ] diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index becf61ed257..bdd17a8b5df 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -21,7 +21,6 @@ Iterator, List, Optional, - Union, cast, ) from uuid import UUID @@ -29,20 +28,15 @@ from opentelemetry.sdk._logs.export import LogExporter from zenml.artifact_stores import BaseArtifactStore -from zenml.artifacts.utils import load_artifact_store -from zenml.client import Client from zenml.enums import LoggingLevels, StackComponentType from zenml.exceptions import DoesNotExistException -from zenml.log_stores.artifact.artifact_log_store_flavor import ( - ArtifactLogStoreConfig, -) from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST +from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path from zenml.utils.logging_utils import LogEntry -from zenml.zen_stores.base_zen_store import BaseZenStore ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") @@ -199,6 +193,10 @@ def parse_log_entry(log_line: str) -> Optional[LogEntry]: ) +class ArtifactLogStoreConfig(OtelLogStoreConfig): + """Configuration for the artifact log store.""" + + class ArtifactLogStore(OtelLogStore): """Log store that saves logs to the artifact store. 
@@ -315,3 +313,10 @@ def fetch( ) return log_entries[:limit] + + def cleanup(self) -> None: + """Cleanup the artifact log store. + + This method is called to ensure that the artifact log store is cleaned up. + """ + self._artifact_store.cleanup() diff --git a/src/zenml/log_stores/artifact/artifact_log_store_flavor.py b/src/zenml/log_stores/artifact/artifact_log_store_flavor.py deleted file mode 100644 index 7923626c285..00000000000 --- a/src/zenml/log_stores/artifact/artifact_log_store_flavor.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) ZenML GmbH 2025. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. -"""Artifact log store flavor implementation.""" - -from typing import TYPE_CHECKING, Type - -from zenml.log_stores.otel.otel_flavor import ( - OtelLogStoreConfig, - OtelLogStoreFlavor, -) - -if TYPE_CHECKING: - from zenml.log_stores.base_log_store import BaseLogStore - - -class ArtifactLogStoreConfig(OtelLogStoreConfig): - """Configuration for the artifact log store.""" - - -class ArtifactLogStoreFlavor(OtelLogStoreFlavor): - """Artifact log store flavor implementation.""" - - @property - def name(self) -> str: - """Name of the flavor. - - Returns: - The name of the flavor. - """ - return "artifact" - - @property - def docs_url(self) -> str: - """URL to the flavor documentation. - - Returns: - The URL to the flavor documentation. - """ - return "https://docs.zenml.io/stack-components/log-stores/artifact" - - @property - def logo_url(self) -> str: - """URL to the flavor logo. - - Returns: - The URL to the flavor logo. - """ - # TODO: Add a logo for the artifact log store - return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/artifact.png" - - @property - def config_class(self) -> Type[ArtifactLogStoreConfig]: - """Returns `ArtifactLogStoreConfig` config class. - - Returns: - The config class. - """ - return ArtifactLogStoreConfig - - @property - def implementation_class(self) -> Type["BaseLogStore"]: - """Implementation class for this flavor. - - Returns: - The implementation class. - """ - from zenml.log_stores.artifact.artifact_log_store import ( - ArtifactLogStore, - ) - - return ArtifactLogStore diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index 18e93db48e9..f0338bd25c2 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -13,14 +13,10 @@ # permissions and limitations under the License. 
"""OpenTelemetry log store flavor.""" -from typing import Type - from pydantic import Field from zenml import __version__ -from zenml.enums import StackComponentType -from zenml.log_stores import BaseLogStore, BaseLogStoreConfig -from zenml.stack.flavor import Flavor +from zenml.log_stores import BaseLogStoreConfig class OtelLogStoreConfig(BaseLogStoreConfig): @@ -53,73 +49,3 @@ class OtelLogStoreConfig(BaseLogStoreConfig): default=512, description="Maximum batch size for exports", ) - - -class OtelLogStoreFlavor(Flavor): - """OpenTelemetry log store flavor.""" - - @property - def name(self) -> str: - """Name of the flavor. - - Returns: - The name of the flavor. - """ - return "otel" - - @property - def docs_url(self) -> str: - """URL to the flavor documentation. - - Returns: - The URL to the flavor documentation. - """ - return "https://docs.zenml.io/stack-components/log-stores/otel" - - @property - def sdk_docs_url(self) -> str: - """URL to the SDK docs for this flavor. - - Returns: - The URL to the SDK docs for this flavor. - """ - return self.docs_url - - @property - def logo_url(self) -> str: - """URL to the flavor logo. - - Returns: - The URL to the flavor logo. - """ - # TODO: Add a logo for the OpenTelemetry log store - return "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/log_store/otel.png" - - @property - def type(self) -> StackComponentType: - """Stack component type. - - Returns: - The stack component type. - """ - return StackComponentType.LOG_STORE - - @property - def config_class(self) -> Type[BaseLogStoreConfig]: - """Returns `OtelLogStoreConfig` config class. - - Returns: - The config class. - """ - return OtelLogStoreConfig - - @property - def implementation_class(self) -> Type[BaseLogStore]: - """Implementation class for this flavor. - - Returns: - The implementation class. 
- """ - from zenml.log_stores.otel.otel_log_store import OtelLogStore - - return OtelLogStore diff --git a/src/zenml/stack/flavor_registry.py b/src/zenml/stack/flavor_registry.py index 653071d316d..349d382fe46 100644 --- a/src/zenml/stack/flavor_registry.py +++ b/src/zenml/stack/flavor_registry.py @@ -70,7 +70,6 @@ def builtin_flavors(self) -> List[Type[Flavor]]: from zenml.deployers import DockerDeployerFlavor, LocalDeployerFlavor from zenml.image_builders import LocalImageBuilderFlavor from zenml.log_stores import ( - ArtifactLogStoreFlavor, DatadogLogStoreFlavor, ) from zenml.orchestrators import ( @@ -89,7 +88,6 @@ def builtin_flavors(self) -> List[Type[Flavor]]: GitHubContainerRegistryFlavor, LocalImageBuilderFlavor, DockerDeployerFlavor, - ArtifactLogStoreFlavor, DatadogLogStoreFlavor, LocalDeployerFlavor, ] diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 4409aa3eab3..609f7ea38c1 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -14,6 +14,7 @@ """Utility functions for logging.""" import logging +import os import threading from contextlib import nullcontext from contextvars import ContextVar @@ -22,7 +23,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Type, cast from uuid import UUID, uuid4 -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from zenml.client import Client from zenml.config.pipeline_configurations import PipelineConfiguration @@ -30,6 +31,7 @@ from zenml.constants import ( ENV_ZENML_DISABLE_PIPELINE_LOGS_STORAGE, ENV_ZENML_DISABLE_STEP_LOGS_STORAGE, + ENV_ZENML_SERVER, handle_bool_env_var, ) from zenml.enums import LoggingLevels, StackComponentType @@ -100,6 +102,11 @@ class LogEntry(BaseModel): description="The unique identifier of the log entry", ) + model_config = ConfigDict( + # ignore extra attributes during model initialization + extra="ignore", + ) + class LoggingContext: """Context manager which collects logs using a LogStore.""" @@ -384,11 +391,20 @@ def fetch_logs( Raises: DoesNotExistException: If the log store doesn't exist or is not the right type. NotImplementedError: If the log store's dependencies are not installed. + RuntimeError: If the function is called from the client environment. """ from zenml.artifacts.utils import load_artifact_store from zenml.log_stores.base_log_store import BaseLogStore from zenml.stack import StackComponent + if ENV_ZENML_SERVER not in os.environ: + # This utility function should not be called from the client environment + # because it would cause instantiating the active log store again. + raise RuntimeError( + "This utility function is only supported in the server " + "environment. Use the log store directly instead." 
+ ) + log_store: Optional[BaseLogStore] = None if logs.log_store_id: @@ -425,4 +441,7 @@ def fetch_logs( else: return [] - return log_store.fetch(logs_model=logs, limit=limit) + try: + return log_store.fetch(logs_model=logs, limit=limit) + finally: + log_store.cleanup() diff --git a/src/zenml/zen_stores/schemas/step_run_schemas.py b/src/zenml/zen_stores/schemas/step_run_schemas.py index 918c8cc3336..c539e3b18d5 100644 --- a/src/zenml/zen_stores/schemas/step_run_schemas.py +++ b/src/zenml/zen_stores/schemas/step_run_schemas.py @@ -277,14 +277,13 @@ def get_query_options( joinedload(jl_arg(StepRunSchema.dynamic_config)), ] - if include_metadata: - options.extend( - [ - selectinload(jl_arg(StepRunSchema.logs)), - # joinedload(jl_arg(StepRunSchema.parents)), - # joinedload(jl_arg(StepRunSchema.run_metadata)), - ] - ) + # if include_metadata: + # options.extend( + # [ + # # joinedload(jl_arg(StepRunSchema.parents)), + # # joinedload(jl_arg(StepRunSchema.run_metadata)), + # ] + # ) if include_resources: options.extend( @@ -311,6 +310,7 @@ def get_query_options( .joinedload( jl_arg(ArtifactVersionSchema.artifact), innerjoin=True ), + selectinload(jl_arg(StepRunSchema.logs)), ] ) diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index 91b26c754ec..3e444f09bb7 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -166,7 +166,7 @@ SecretsStoreNotConfiguredError, ) from zenml.io import fileio -from zenml.logger import get_logger, get_logging_level, get_zenml_handler +from zenml.logger import get_console_handler, get_logger, get_logging_level from zenml.metadata.metadata_types import get_metadata_type from zenml.models import ( ActionFilter, @@ -1587,7 +1587,7 @@ def migrate_database(self) -> None: else: alembic_logger.setLevel(logging.WARNING) - alembic_logger.addHandler(get_zenml_handler()) + alembic_logger.addHandler(get_console_handler()) # We need to account for 3 distinct cases here: # 1. the database is completely empty (not initialized) diff --git a/tests/integration/functional/zen_stores/test_zen_store.py b/tests/integration/functional/zen_stores/test_zen_store.py index 70dc8a58735..dc46a424cab 100644 --- a/tests/integration/functional/zen_stores/test_zen_store.py +++ b/tests/integration/functional/zen_stores/test_zen_store.py @@ -138,7 +138,6 @@ ) from zenml.utils import code_repository_utils, source_utils from zenml.utils.enum_utils import StrEnum -from zenml.utils.logging_utils import fetch_logs from zenml.zen_stores.rest_zen_store import RestZenStore from zenml.zen_stores.sql_zen_store import SqlZenStore @@ -3239,13 +3238,14 @@ def test_artifact_fetch_works_with_invalid_name(clean_client: "Client"): def test_logs_are_recorded_properly(clean_client): """Tests if logs are stored in the artifact store.""" client = Client() - store = client.zen_store run_context = PipelineRunContext(1) with run_context: steps = run_context.steps step1_logs = steps[0].logs - step1_logs_content = fetch_logs(step1_logs, store, limit=100) + step1_logs_content = client.active_stack.log_store.fetch( + step1_logs, limit=100 + ) # Step 1 has the word log! 
Defined in PipelineRunContext assert any("log" in record.message for record in step1_logs_content) From dd9e23d7d89ee9dcce5d048ec25a687be190bdfe Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Wed, 3 Dec 2025 10:52:27 +0100 Subject: [PATCH 60/81] Add metadata to logs and replaced the datadog exporter with the standard OTEL exporter --- pyproject.toml | 1 + .../artifact/artifact_log_exporter.py | 2 +- .../log_stores/artifact/artifact_log_store.py | 39 +++++- src/zenml/log_stores/base_log_store.py | 4 +- .../datadog/datadog_log_exporter.py | 125 ------------------ .../log_stores/datadog/datadog_log_store.py | 31 +++-- src/zenml/log_stores/otel/otel_log_store.py | 67 +++------- src/zenml/orchestrators/step_launcher.py | 1 + src/zenml/orchestrators/step_runner.py | 1 + src/zenml/utils/logging_utils.py | 107 ++++++++++++++- 10 files changed, 183 insertions(+), 195 deletions(-) delete mode 100644 src/zenml/log_stores/datadog/datadog_log_exporter.py diff --git a/pyproject.toml b/pyproject.toml index 308aae5cbdb..a3d65ba50a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "gitpython>=3.1.18,<4.0.0", "jsonref", "opentelemetry-sdk>=1.0,<=1.38.0", + "opentelemetry-exporter-otlp-proto-http>=1.0,<=1.38.0", "packaging>=24.1", "psutil>=5.0.0", "pydantic>=2.0,<=2.11.9", diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 94475d34c2b..178261cd427 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -25,11 +25,11 @@ from zenml.artifact_stores.base_artifact_store import BaseArtifactStore from zenml.enums import LoggingLevels from zenml.log_stores.artifact.artifact_log_store import ( + ZENML_OTEL_LOG_STORE_FLUSH_KEY, remove_ansi_escape_codes, ) from zenml.log_stores.otel.otel_log_store import ( ZENML_OTEL_LOG_STORE_CONTEXT_KEY, - ZENML_OTEL_LOG_STORE_FLUSH_KEY, ) from zenml.logger import get_logger from zenml.models import LogsResponse diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index bdd17a8b5df..c476ef47051 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -25,6 +25,7 @@ ) from uuid import UUID +from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter from zenml.artifact_stores import BaseArtifactStore @@ -32,7 +33,10 @@ from zenml.exceptions import DoesNotExistException from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig -from zenml.log_stores.otel.otel_log_store import OtelLogStore +from zenml.log_stores.otel.otel_log_store import ( + ZENML_OTEL_LOG_STORE_CONTEXT_KEY, + OtelLogStore, +) from zenml.logger import get_logger from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path @@ -44,6 +48,9 @@ LOGS_EXTENSION = ".log" +ZENML_OTEL_LOG_STORE_FLUSH_KEY = otel_context.create_key( + "zenml.log_store_flush" +) def prepare_logs_uri( @@ -263,6 +270,36 @@ def get_exporter(self) -> "LogExporter": return ArtifactLogExporter(artifact_store=self._artifact_store) + def finalize( + self, + log_model: LogsResponse, + ) -> None: + """Finalize the stream of log records associated with a log model. + + Args: + log_model: The log model to finalize. 
+ """ + with self._lock: + if not self._provider: + return + + # Attach the log_model to OTel's context so the exporter + # can access it in the background processor thread + ctx = otel_context.set_value( + ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model + ) + ctx = otel_context.set_value( + ZENML_OTEL_LOG_STORE_FLUSH_KEY, True, context=ctx + ) + + otel_logger = self._provider.get_logger( + "zenml.log_store.flush", + schema_url=None, + ) + otel_logger.emit( + context=ctx, + ) + def fetch( self, logs_model: "LogsResponse", diff --git a/src/zenml/log_stores/base_log_store.py b/src/zenml/log_stores/base_log_store.py index 8cfb710c3b0..1abc44b5c59 100644 --- a/src/zenml/log_stores/base_log_store.py +++ b/src/zenml/log_stores/base_log_store.py @@ -17,7 +17,7 @@ import threading from abc import abstractmethod from datetime import datetime -from typing import Any, List, Optional, Type, cast +from typing import Any, Dict, List, Optional, Type, cast from zenml.enums import StackComponentType from zenml.models import LogsResponse @@ -64,12 +64,14 @@ def emit( self, record: logging.LogRecord, log_model: LogsResponse, + metadata: Dict[str, Any], ) -> None: """Process a log record from the logging system. Args: record: The Python logging.LogRecord to process. log_model: The log model to emit the log record to. + metadata: Additional metadata to attach to the log entry. """ @abstractmethod diff --git a/src/zenml/log_stores/datadog/datadog_log_exporter.py b/src/zenml/log_stores/datadog/datadog_log_exporter.py deleted file mode 100644 index 21307578e63..00000000000 --- a/src/zenml/log_stores/datadog/datadog_log_exporter.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) ZenML GmbH 2025. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing -# permissions and limitations under the License. -"""OpenTelemetry exporter that sends logs to Datadog.""" - -from typing import Sequence - -import requests -from opentelemetry.sdk._logs import LogData -from opentelemetry.sdk._logs.export import LogExporter, LogExportResult - -from zenml.logger import get_logger - -logger = get_logger(__name__) - - -class DatadogLogExporter(LogExporter): - """Custom log exporter that sends logs to Datadog's HTTP intake API. - - This exporter transforms OpenTelemetry log records into Datadog's format - and sends them via HTTP POST without requiring the Datadog SDK. - """ - - def __init__( - self, - api_key: str, - site: str = "datadoghq.com", - ): - """Initialize the Datadog log exporter. - - Args: - api_key: Datadog API key. - site: Datadog site domain. - """ - self.endpoint = f"https://http-intake.logs.{site}/v1/input" - self.headers = { - "DD-API-KEY": api_key, - "Content-Type": "application/json", - } - - def export(self, batch: Sequence["LogData"]) -> LogExportResult: - """Export a batch of log records to Datadog. - - Args: - batch: List of LogData objects from OpenTelemetry. - - Returns: - LogExportResult indicating success or failure. 
- """ - logs = [] - for log_data in batch: - log_record = log_data.log_record - - resource_attrs = {} - if log_record.resource: - resource_attrs = dict(log_record.resource.attributes) - - log_attrs = {} - if log_record.attributes: - log_attrs = dict(log_record.attributes) - - all_attrs = {**resource_attrs, **log_attrs} - - log_entry = { - "message": str(log_record.body), - } - - if log_record.severity_text: - log_entry["status"] = log_record.severity_text.lower() - - if log_record.timestamp: - log_entry["timestamp"] = str( - int(log_record.timestamp / 1_000_000) - ) - - if all_attrs: - tags = [f"{k}:{v}" for k, v in all_attrs.items()] - log_entry["ddtags"] = ",".join(tags) - - logs.append(log_entry) - - try: - response = requests.post( - self.endpoint, - headers=self.headers, - json=logs, - timeout=10, - ) - - if response.status_code in [200, 202]: - logger.debug(f"Successfully sent {len(logs)} logs to Datadog") - return LogExportResult.SUCCESS - else: - logger.warning( - f"Datadog rejected logs: {response.status_code} - {response.text[:200]}" - ) - return LogExportResult.FAILURE - except Exception: - logger.exception("Failed to export logs to Datadog") - return LogExportResult.FAILURE - - def shutdown(self) -> None: - """Shutdown the exporter.""" - pass - - def force_flush(self, timeout_millis: int = 30000) -> bool: - """Force flush any buffered logs. - - Args: - timeout_millis: Timeout in milliseconds. - - Returns: - True if successful. - """ - return True diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 56bcaee0379..79dd930e93a 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -17,7 +17,9 @@ from typing import Any, Dict, List, Optional, cast import requests -from opentelemetry.sdk._logs.export import LogExporter +from opentelemetry.exporter.otlp.proto.http._log_exporter import ( + OTLPLogExporter, +) from zenml.enums import LoggingLevels from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST @@ -37,6 +39,8 @@ class DatadogLogStore(OtelLogStore): to Datadog's HTTP intake API. """ + _otlp_exporter: Optional[OTLPLogExporter] = None + @property def config(self) -> DatadogLogStoreConfig: """Returns the configuration of the Datadog log store. @@ -46,20 +50,18 @@ def config(self) -> DatadogLogStoreConfig: """ return cast(DatadogLogStoreConfig, self._config) - def get_exporter(self) -> "LogExporter": + def get_exporter(self) -> OTLPLogExporter: """Get the Datadog log exporter. Returns: DatadogLogExporter configured with API key and site. """ - from zenml.log_stores.datadog.datadog_log_exporter import ( - DatadogLogExporter, - ) - - return DatadogLogExporter( - api_key=self.config.api_key.get_secret_value(), - site=self.config.site, - ) + if not self._otlp_exporter: + self._otlp_exporter = OTLPLogExporter( + endpoint=f"https://http-intake.logs.{self.config.site}/v1/logs", + headers={"dd-api-key": self.config.api_key.get_secret_value()}, + ) + return self._otlp_exporter def fetch( self, @@ -191,3 +193,12 @@ def _parse_log_level( return LoggingLevels.CRITICAL else: return LoggingLevels.INFO + + def cleanup(self) -> None: + """Cleanup the Datadog log store. + + This method is called when the log store is no longer needed. 
+ """ + if self._otlp_exporter: + self._otlp_exporter.shutdown() # type: ignore[no-untyped-call] + self._otlp_exporter = None diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 38bdbc68f75..040cc8280d9 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -16,11 +16,11 @@ import logging from abc import abstractmethod from datetime import datetime -from typing import TYPE_CHECKING, Any, List, Optional, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast from opentelemetry import context as otel_context -from opentelemetry._logs.severity import SeverityNumber from opentelemetry.sdk._logs import LoggerProvider +from opentelemetry.sdk._logs._internal import std_to_otel from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource @@ -42,9 +42,6 @@ ZENML_OTEL_LOG_STORE_CONTEXT_KEY = otel_context.create_key( "zenml.logging_context" ) -ZENML_OTEL_LOG_STORE_FLUSH_KEY = otel_context.create_key( - "zenml.log_store_flush" -) class OtelLogStore(BaseLogStore): @@ -107,12 +104,14 @@ def emit( self, record: logging.LogRecord, log_model: "LogsResponse", + metadata: Dict[str, Any], ) -> None: """Process a log record by sending to OpenTelemetry. Args: record: The log record to process. log_model: The log model to emit the log record to. + metadata: Additional metadata to attach to the log entry. Raises: RuntimeError: If the OpenTelemetry provider is not initialized. @@ -147,18 +146,25 @@ def emit( else: message = exc_text + zenml_log_metadata = { + f"zenml.{key}": value for key, value in metadata.items() + } + otel_logger.emit( timestamp=int(record.created * 1e9), observed_timestamp=int(record.created * 1e9), - severity_number=self._get_severity_number(record.levelno), - severity_text=record.levelname, + severity_number=std_to_otel(record.levelno), + severity_text="WARN" + if record.levelname == "WARNING" + else record.levelname, body=message, attributes={ "code.filepath": record.pathname, "code.lineno": record.lineno, "code.function": record.funcName, - "log_id": str(log_model.id), - "log_store_id": str(self.id), + "zenml.log_id": str(log_model.id), + "zenml.log_store_id": str(self.id), + **zenml_log_metadata, }, context=ctx, ) @@ -172,26 +178,7 @@ def finalize( Args: log_model: The log model to finalize. """ - with self._lock: - if not self._provider: - return - - # Attach the log_model to OTel's context so the exporter - # can access it in the background processor thread - ctx = otel_context.set_value( - ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model - ) - ctx = otel_context.set_value( - ZENML_OTEL_LOG_STORE_FLUSH_KEY, True, context=ctx - ) - - otel_logger = self._provider.get_logger( - "zenml.log_store.flush", - schema_url=None, - ) - otel_logger.emit( - context=ctx, - ) + pass def flush(self) -> None: """Flush the log store. @@ -201,28 +188,6 @@ def flush(self) -> None: if self._processor: self._processor.force_flush() - def _get_severity_number(self, levelno: int) -> SeverityNumber: - """Map Python log level to OTEL severity number. - - Args: - levelno: Python logging level number. - - Returns: - OTEL severity number. 
- """ - if levelno >= logging.CRITICAL: - return SeverityNumber.FATAL - elif levelno >= logging.ERROR: - return SeverityNumber.ERROR - elif levelno >= logging.WARNING: - return SeverityNumber.WARN - elif levelno >= logging.INFO: - return SeverityNumber.INFO - elif levelno >= logging.DEBUG: - return SeverityNumber.DEBUG - else: - return SeverityNumber.UNSPECIFIED - def deactivate(self) -> None: """Deactivate log collection and shut down the processor. diff --git a/src/zenml/orchestrators/step_launcher.py b/src/zenml/orchestrators/step_launcher.py index 8252bcbb18a..bbf11b6cadd 100644 --- a/src/zenml/orchestrators/step_launcher.py +++ b/src/zenml/orchestrators/step_launcher.py @@ -310,6 +310,7 @@ def launch(self) -> StepRunResponse: ): logs_context = setup_step_logging( step_run=step_run, + pipeline_run=pipeline_run, source="prepare_step", ) diff --git a/src/zenml/orchestrators/step_runner.py b/src/zenml/orchestrators/step_runner.py index e1cc1b3949c..b5ff2acc583 100644 --- a/src/zenml/orchestrators/step_runner.py +++ b/src/zenml/orchestrators/step_runner.py @@ -148,6 +148,7 @@ def run( if is_step_logging_enabled(step_run.config, pipeline_run.config): logs_context = setup_step_logging( step_run=step_run, + pipeline_run=pipeline_run, source="step", ) diff --git a/src/zenml/utils/logging_utils.py b/src/zenml/utils/logging_utils.py index 609f7ea38c1..842c7e1c2a0 100644 --- a/src/zenml/utils/logging_utils.py +++ b/src/zenml/utils/logging_utils.py @@ -20,7 +20,7 @@ from contextvars import ContextVar from datetime import datetime from types import TracebackType -from typing import TYPE_CHECKING, Any, List, Optional, Type, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, cast from uuid import UUID, uuid4 from pydantic import BaseModel, ConfigDict, Field @@ -114,17 +114,20 @@ class LoggingContext: def __init__( self, log_model: "LogsResponse", + **metadata: Any, ) -> None: """Initialize the logging context. Args: log_model: The logs response model for this context. + **metadata: Additional metadata to attach to the log entry. """ self.log_model = log_model self._lock = threading.Lock() self._previous_context: Optional[LoggingContext] = None self._disabled = False self._log_store = Client().active_stack.log_store + self._metadata = metadata @classmethod def emit(cls, record: logging.LogRecord) -> None: @@ -143,7 +146,9 @@ def emit(cls, record: logging.LogRecord) -> None: try: message = record.getMessage() if message and message.strip(): - context._log_store.emit(record, context.log_model) + context._log_store.emit( + record, context.log_model, context._metadata + ) except Exception: logger.debug("Failed to emit log record", exc_info=True) finally: @@ -293,6 +298,51 @@ def search_logs_by_source( return None +def get_run_log_metadata( + pipeline_run: "PipelineRunResponse", +) -> Dict[str, Any]: + """Get the log metadata for a pipeline run. + + Args: + pipeline_run: The pipeline run. + + Returns: + The log metadata. 
+ """ + log_metadata = dict( + pipeline_run_id=str(pipeline_run.id), + pipeline_run_name=pipeline_run.name, + project_id=str(pipeline_run.project.id), + project_name=pipeline_run.project.name, + ) + + if pipeline_run.pipeline is not None: + log_metadata.update( + dict( + pipeline_id=str(pipeline_run.pipeline.id), + pipeline_name=pipeline_run.pipeline.name, + ) + ) + + if pipeline_run.stack is not None: + log_metadata.update( + dict( + stack_id=str(pipeline_run.stack.id), + stack_name=pipeline_run.stack.name, + ) + ) + + if pipeline_run.user is not None: + log_metadata.update( + dict( + user_id=str(pipeline_run.user.id), + user_name=pipeline_run.user.name, + ) + ) + + return log_metadata + + def setup_run_logging( pipeline_run: "PipelineRunResponse", source: str, @@ -308,11 +358,17 @@ def setup_run_logging( Returns: The logs context. """ + log_metadata = get_run_log_metadata(pipeline_run) + log_metadata.update(dict(source=source)) + if pipeline_run.log_collection is not None: if run_logs := search_logs_by_source( pipeline_run.log_collection, source ): - return LoggingContext(log_model=run_logs) + return LoggingContext( + log_model=run_logs, + **log_metadata, + ) logs_request = generate_logs_request(source=source) try: @@ -328,13 +384,39 @@ def setup_run_logging( if run_logs := search_logs_by_source( pipeline_run.log_collection, source ): - return LoggingContext(log_model=run_logs) + return LoggingContext( + log_model=run_logs, + **log_metadata, + ) return nullcontext() +def get_step_log_metadata( + step_run: "StepRunResponse", pipeline_run: "PipelineRunResponse" +) -> Dict[str, Any]: + """Get the log metadata for a step run. + + Args: + step_run: The step run. + pipeline_run: The pipeline run. + + Returns: + The log metadata. + """ + log_metadata = get_run_log_metadata(pipeline_run) + log_metadata.update( + dict( + step_run_id=str(step_run.id), + step_run_name=step_run.name, + ) + ) + return log_metadata + + def setup_step_logging( step_run: "StepRunResponse", + pipeline_run: "PipelineRunResponse", source: str, ) -> Any: """Set up logging for a step run. @@ -343,14 +425,27 @@ def setup_step_logging( Args: step_run: The step run. + pipeline_run: The pipeline run. source: The source of the logs. Returns: The logs context. 
""" + log_metadata = get_step_log_metadata(step_run, pipeline_run) + log_metadata.update(dict(source=source)) + + if pipeline_run.log_collection is not None: + if run_logs := search_logs_by_source( + pipeline_run.log_collection, source + ): + return LoggingContext( + log_model=run_logs, + **log_metadata, + ) + if step_run.log_collection is not None: if step_logs := search_logs_by_source(step_run.log_collection, source): - return LoggingContext(log_model=step_logs) + return LoggingContext(log_model=step_logs, **log_metadata) logs_request = generate_logs_request(source=source) try: @@ -364,7 +459,7 @@ def setup_step_logging( if step_run.log_collection is not None: if step_logs := search_logs_by_source(step_run.log_collection, source): - return LoggingContext(log_model=step_logs) + return LoggingContext(log_model=step_logs, **log_metadata) return nullcontext() From 255224f4bc87dcfbd9bea6630464caea6f1359fb Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Wed, 3 Dec 2025 11:19:32 +0100 Subject: [PATCH 61/81] Fixed datadog log fetching --- src/zenml/log_stores/datadog/datadog_flavor.py | 4 ++++ src/zenml/log_stores/datadog/datadog_log_store.py | 9 ++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index eea2f975acf..c9b7a1677e6 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -29,12 +29,16 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): Attributes: api_key: Datadog API key for log ingestion. + application_key: Datadog application key for log extraction. site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). """ api_key: PlainSerializedSecretStr = Field( description="Datadog API key for log ingestion", ) + application_key: PlainSerializedSecretStr = Field( + description="Datadog application key for log extraction", + ) site: str = Field( default="datadoghq.com", description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 79dd930e93a..0d237f458eb 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -88,15 +88,9 @@ def fetch( # Build query query_parts = [ f"service:{self.config.service_name}", - f"zenml.pipeline_run_id:{logs_model.pipeline_run_id}", + f"@zenml.log_id:{logs_model.id}", ] - if logs_model.step_run_id: - query_parts.append(f"zenml.step_id:{logs_model.step_run_id}") - - if logs_model.source: - query_parts.append(f"zenml.source:{logs_model.source}") - query = " ".join(query_parts) # Build API request @@ -105,6 +99,7 @@ def fetch( ) headers = { "DD-API-KEY": self.config.api_key.get_secret_value(), + "DD-APPLICATION-KEY": self.config.application_key.get_secret_value(), "Content-Type": "application/json", } From 6fee1ebb59a997fed8ebcf9d71302f0720377c9e Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Wed, 3 Dec 2025 13:50:34 +0100 Subject: [PATCH 62/81] docstrings --- src/zenml/log_stores/artifact/artifact_log_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index c476ef47051..edfae1aaeb8 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -95,7 +95,6 @@ def fetch_log_records( """Fetches log entries. 
Args: - zen_store: The store in which the artifact is stored. artifact_store: The artifact store. logs_uri: The URI of the artifact (file or directory). From 04d18d020389a6ca6fa8022d723da07e84c857b8 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Wed, 3 Dec 2025 15:02:46 +0100 Subject: [PATCH 63/81] Update src/zenml/log_stores/artifact/artifact_log_exporter.py --- src/zenml/log_stores/artifact/artifact_log_exporter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 178261cd427..45509ed3d28 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -82,12 +82,12 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: context=log_data.log_record.context, ), ) + if not log_model: + continue flush = otel_context.get_value( key=ZENML_OTEL_LOG_STORE_FLUSH_KEY, context=log_data.log_record.context, ) - if not log_model: - continue if flush: finalized_log_streams.append(log_model) From 03780822c18255c499c78625edc583f04a12d7d6 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Wed, 3 Dec 2025 21:52:35 +0100 Subject: [PATCH 64/81] Removed context, fixed datadog fetch time window, used OTEL handler --- .../artifact/artifact_log_exporter.py | 169 ++++++++---------- .../log_stores/artifact/artifact_log_store.py | 29 +-- .../log_stores/datadog/datadog_log_store.py | 33 ++-- src/zenml/log_stores/otel/otel_log_store.py | 70 +++----- 4 files changed, 127 insertions(+), 174 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 45509ed3d28..7a363703053 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -16,23 +16,18 @@ import os import time from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, cast -from uuid import UUID, uuid4 +from typing import TYPE_CHECKING, Dict, List, Sequence +from uuid import uuid4 -from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter, LogExportResult from zenml.artifact_stores.base_artifact_store import BaseArtifactStore from zenml.enums import LoggingLevels from zenml.log_stores.artifact.artifact_log_store import ( - ZENML_OTEL_LOG_STORE_FLUSH_KEY, + END_OF_STREAM_MESSAGE, remove_ansi_escape_codes, ) -from zenml.log_stores.otel.otel_log_store import ( - ZENML_OTEL_LOG_STORE_CONTEXT_KEY, -) from zenml.logger import get_logger -from zenml.models import LogsResponse from zenml.utils.logging_utils import LogEntry from zenml.utils.time_utils import utc_now @@ -70,43 +65,32 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: return LogExportResult.SUCCESS try: - entries_by_id: Dict[UUID, List[str]] = defaultdict(list) - responses_by_id: Dict[UUID, "LogsResponse"] = {} - finalized_log_streams: List["LogsResponse"] = [] + logs_by_uri: Dict[str, List[str]] = defaultdict(list) + finalized_log_streams: List[str] = [] for log_data in batch: - log_model = cast( - Optional["LogsResponse"], - otel_context.get_value( - key=ZENML_OTEL_LOG_STORE_CONTEXT_KEY, - context=log_data.log_record.context, - ), - ) - if not log_model: + attrs = log_data.log_record.attributes + if not attrs: continue - flush = otel_context.get_value( - key=ZENML_OTEL_LOG_STORE_FLUSH_KEY, - context=log_data.log_record.context, 
- ) - - if flush: - finalized_log_streams.append(log_model) + log_uri = attrs.get("zenml.log_uri") + if not log_uri or not isinstance(log_uri, str): continue - responses_by_id[log_model.id] = log_model + if log_data.log_record.body is END_OF_STREAM_MESSAGE: + finalized_log_streams.append(log_uri) + continue entries = self._otel_record_to_log_entries(log_data) for entry in entries: json_line = entry.model_dump_json(exclude_none=True) - entries_by_id[log_model.id].append(json_line) + logs_by_uri[log_uri].append(json_line) - for log_id, log_lines in entries_by_id.items(): + for log_uri, log_lines in logs_by_uri.items(): if log_lines: - log_model = responses_by_id[log_id] - self._write(log_lines, log_model) + self._write(log_lines, log_uri) - for log_model in finalized_log_streams: - self._finalize(log_model) + for log_uri in finalized_log_streams: + self._finalize(log_uri) return LogExportResult.SUCCESS @@ -127,6 +111,14 @@ def _otel_record_to_log_entries( """ log_record = log_data.log_record message = str(log_record.body) if log_record.body else "" + if log_record.attributes and log_record.attributes.get( + "exception.message" + ): + exc_message = log_record.attributes.get("exception.message") + exc_type = log_record.attributes.get("exception.type") + exc_stacktrace = log_record.attributes.get("exception.stacktrace") + message += f"\n{exc_type}: {exc_message}\n{exc_stacktrace}" + message = remove_ansi_escape_codes(message).rstrip() level = ( @@ -246,131 +238,110 @@ def _split_to_chunks(self, message: str) -> List[str]: def _write( self, log_lines: List[str], - log_model: "LogsResponse", + log_uri: str, ) -> None: """Write log lines to the artifact store. Args: log_lines: List of JSON-serialized log entries. - log_model: The log model. + log_uri: The URI of the log files to write. Raises: Exception: If the log lines cannot be written to the artifact store. """ - if not log_model.uri or not log_model.artifact_store_id: - logger.warning( - f"Skipping log write: missing uri or artifact_store_id for log {log_model.id}" - ) - return - try: content = "\n".join(log_lines) + "\n" if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - if not self.artifact_store.exists(log_model.uri): - self.artifact_store.makedirs(log_model.uri) + if not self.artifact_store.exists(log_uri): + self.artifact_store.makedirs(log_uri) timestamp = time.time() file_uri = os.path.join( - log_model.uri, + log_uri, f"{timestamp}{LOGS_EXTENSION}", ) with self.artifact_store.open(file_uri, "w") as f: f.write(content) else: - logs_base_uri = os.path.dirname(log_model.uri) + logs_base_uri = os.path.dirname(log_uri) if not self.artifact_store.exists(logs_base_uri): self.artifact_store.makedirs(logs_base_uri) - with self.artifact_store.open(log_model.uri, "a") as f: + with self.artifact_store.open(log_uri, "a") as f: f.write(content) except Exception as e: - logger.error(f"Failed to write logs to {log_model.uri}: {e}") + logger.error(f"Failed to write logs to {log_uri}: {e}") raise def _finalize( self, - log_model: "LogsResponse", + log_uri: str, ) -> None: """Finalize the logs for a given log model by merging all log files into one. Args: - log_model: The log model. + log_uri: The URI of the log files to finalize. Raises: Exception: If the logs cannot be finalized. 
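Each exported batch therefore ends up as newline-delimited JSON, one serialized LogEntry per line, grouped by the record's "zenml.log_uri" attribute. A rough sketch of a single line, using stand-in definitions for LogEntry and LoggingLevels (the real ones live in zenml.utils.logging_utils and zenml.enums, so serialized enum values may differ):

    # Stand-in models for illustration; only message/level/timestamp are assumed,
    # matching the fields the exporter and the Datadog fetch path rely on.
    from datetime import datetime, timezone
    from enum import Enum
    from typing import Optional

    from pydantic import BaseModel


    class LoggingLevels(str, Enum):
        INFO = "INFO"
        WARN = "WARN"
        ERROR = "ERROR"


    class LogEntry(BaseModel):
        message: str
        level: Optional[LoggingLevels] = None
        timestamp: Optional[datetime] = None


    line = LogEntry(
        message="Step `trainer` has started.",
        level=LoggingLevels.INFO,
        timestamp=datetime.now(timezone.utc),
    ).model_dump_json(exclude_none=True)
    # line -> '{"message":"Step `trainer` has started.","level":"INFO","timestamp":"2025-..."}'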
""" - if not log_model.uri or not log_model.artifact_store_id: - logger.warning( - f"Skipping log finalize: missing uri or artifact_store_id for log {log_model.id}" - ) - return - try: if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - self._merge(log_model) + self._merge(log_uri) else: - self.artifact_store._remove_previous_file_versions( - log_model.uri - ) + self.artifact_store._remove_previous_file_versions(log_uri) except Exception as e: - logger.error(f"Failed to finalize logs for {log_model.uri}: {e}") + logger.error(f"Failed to finalize logs for {log_uri}: {e}") raise - def _merge(self, log_model: "LogsResponse") -> None: + def _merge(self, log_uri: str) -> None: """Merges all log files into one in the given URI. Called on the logging context exit. Args: - log_model: The log model. + log_uri: The URI of the log files to merge. Raises: RuntimeError: If the log model has no URI, cannot merge logs. """ - # If the artifact store is immutable, merge the log files - if self.artifact_store.config.IS_IMMUTABLE_FILESYSTEM: - from zenml.artifacts.utils import _load_file_from_artifact_store - from zenml.exceptions import DoesNotExistException - - if not log_model.uri: - raise RuntimeError("Log model has no URI, cannot merge logs.") - - files_ = self.artifact_store.listdir(log_model.uri) - if len(files_) > 1: - files_.sort() - - missing_files = set() - # dump all logs to a local file first - with self.artifact_store.open( - os.path.join( - log_model.uri, f"{time.time()}_merged{LOGS_EXTENSION}" - ), - "w", - ) as merged_file: - for file in files_: - try: - merged_file.write( - str( - _load_file_from_artifact_store( - os.path.join(log_model.uri, str(file)), - artifact_store=self.artifact_store, - mode="r", - ) + from zenml.artifacts.utils import _load_file_from_artifact_store + from zenml.exceptions import DoesNotExistException + + files_ = self.artifact_store.listdir(log_uri) + if len(files_) > 1: + files_.sort() + + missing_files = set() + # dump all logs to a local file first + with self.artifact_store.open( + os.path.join(log_uri, f"{time.time()}_merged{LOGS_EXTENSION}"), + "w", + ) as merged_file: + for file in files_: + try: + merged_file.write( + str( + _load_file_from_artifact_store( + os.path.join(log_uri, str(file)), + artifact_store=self.artifact_store, + mode="r", ) ) - except DoesNotExistException: - missing_files.add(file) - - # clean up left over files - for file in files_: - if file not in missing_files: - self.artifact_store.remove( - os.path.join(log_model.uri, str(file)) ) + except DoesNotExistException: + missing_files.add(file) + + # clean up left over files + for file in files_: + if file not in missing_files: + self.artifact_store.remove( + os.path.join(log_uri, str(file)) + ) def shutdown(self) -> None: """Shutdown the exporter.""" diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index edfae1aaeb8..2d030913139 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -25,7 +25,6 @@ ) from uuid import UUID -from opentelemetry import context as otel_context from opentelemetry.sdk._logs.export import LogExporter from zenml.artifact_stores import BaseArtifactStore @@ -34,7 +33,6 @@ from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig from zenml.log_stores.otel.otel_log_store import ( - ZENML_OTEL_LOG_STORE_CONTEXT_KEY, OtelLogStore, ) from zenml.logger 
import get_logger @@ -48,9 +46,7 @@ LOGS_EXTENSION = ".log" -ZENML_OTEL_LOG_STORE_FLUSH_KEY = otel_context.create_key( - "zenml.log_store_flush" -) +END_OF_STREAM_MESSAGE = "END_OF_STREAM" def prepare_logs_uri( @@ -279,24 +275,15 @@ def finalize( log_model: The log model to finalize. """ with self._lock: - if not self._provider: + if not self._provider or self._logger is None: return - # Attach the log_model to OTel's context so the exporter - # can access it in the background processor thread - ctx = otel_context.set_value( - ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model - ) - ctx = otel_context.set_value( - ZENML_OTEL_LOG_STORE_FLUSH_KEY, True, context=ctx - ) - - otel_logger = self._provider.get_logger( - "zenml.log_store.flush", - schema_url=None, - ) - otel_logger.emit( - context=ctx, + self._logger.emit( + body=END_OF_STREAM_MESSAGE, + attributes={ + "zenml.log_id": str(log_model.id), + "zenml.log_uri": str(log_model.uri), + }, ) def fetch( diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 0d237f458eb..85b25256463 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -106,6 +106,16 @@ def fetch( body: Dict[str, Any] = { "filter": { "query": query, + "from": ( + start_time.isoformat() + if start_time + else logs_model.created.isoformat() + ), + "to": ( + end_time.isoformat() + if end_time + else datetime.now().astimezone().isoformat() + ), }, "page": { "limit": min(limit, 1000), # Datadog API limit @@ -113,12 +123,6 @@ def fetch( "sort": "timestamp", } - # Add time filters if provided - if start_time: - body["filter"]["from"] = start_time.isoformat() - if end_time: - body["filter"]["to"] = end_time.isoformat() - try: response = requests.post( api_endpoint, @@ -137,16 +141,23 @@ def fetch( log_entries = [] for log in data.get("data", []): - attributes = log.get("attributes", {}) + log_fields = log.get("attributes", {}) + message = log_fields.get("message", "") + attributes = log_fields.get("attributes", {}) + if exc_info := attributes.get("exception"): + exc_message = exc_info.get("message") + exc_type = exc_info.get("type") + exc_stacktrace = exc_info.get("stacktrace") + message += f"\n{exc_type}: {exc_message}\n{exc_stacktrace}" # Parse log entry entry = LogEntry( - message=attributes.get("message", ""), - level=self._parse_log_level(attributes.get("status")), + message=message, + level=self._parse_log_level(log_fields.get("status")), timestamp=datetime.fromisoformat( - attributes["timestamp"].replace("Z", "+00:00") + log_fields["timestamp"].replace("Z", "+00:00") ) - if "timestamp" in attributes + if "timestamp" in log_fields else None, ) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 040cc8280d9..8deb8cdf6ac 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -18,9 +18,11 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast -from opentelemetry import context as otel_context -from opentelemetry.sdk._logs import LoggerProvider -from opentelemetry.sdk._logs._internal import std_to_otel +from opentelemetry.sdk._logs import ( + Logger, + LoggerProvider, + LoggingHandler, +) from opentelemetry.sdk._logs.export import BatchLogRecordProcessor from opentelemetry.sdk.resources import Resource @@ -39,10 +41,6 @@ logger = get_logger(__name__) -ZENML_OTEL_LOG_STORE_CONTEXT_KEY = 
otel_context.create_key( - "zenml.logging_context" -) - class OtelLogStore(BaseLogStore): """Log store that exports logs using OpenTelemetry. @@ -64,6 +62,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._exporter: Optional["LogExporter"] = None self._provider: Optional["LoggerProvider"] = None self._processor: Optional["BatchLogRecordProcessor"] = None + self._logger: Optional["Logger"] = None + self._handler: Optional["LoggingHandler"] = None @property def config(self) -> OtelLogStoreConfig: @@ -100,6 +100,11 @@ def activate(self) -> None: self._provider = LoggerProvider(resource=self._resource) self._provider.add_log_record_processor(self._processor) + self._logger = self._provider.get_logger( + "zenml.log_store.emit", + ) + self._handler = LoggingHandler(logger_provider=self._provider) + def emit( self, record: logging.LogRecord, @@ -120,55 +125,34 @@ def emit( if not self._provider: self.activate() - if self._provider is None: + if ( + self._provider is None + or self._logger is None + or self._handler is None + ): raise RuntimeError("OpenTelemetry provider is not initialized") - # Attach the log_model to OTel's context so the exporter - # can access it in the background processor thread - ctx = otel_context.set_value( - ZENML_OTEL_LOG_STORE_CONTEXT_KEY, log_model - ) + emit_kwargs = self._handler._translate(record) - otel_logger = self._provider.get_logger( - record.name or "unknown", - schema_url=None, - ) - - # Get the message and append formatted exception if present - message = record.getMessage() - if record.exc_info: - import traceback - - exc_text = "".join(traceback.format_exception(*record.exc_info)) - # Append to message with separator if message exists - if message: - message = f"{message}\n{exc_text}" - else: - message = exc_text + attributes = emit_kwargs.get("attributes", {}) zenml_log_metadata = { f"zenml.{key}": value for key, value in metadata.items() } - otel_logger.emit( - timestamp=int(record.created * 1e9), - observed_timestamp=int(record.created * 1e9), - severity_number=std_to_otel(record.levelno), - severity_text="WARN" - if record.levelname == "WARNING" - else record.levelname, - body=message, - attributes={ - "code.filepath": record.pathname, - "code.lineno": record.lineno, - "code.function": record.funcName, + attributes.update( + { "zenml.log_id": str(log_model.id), "zenml.log_store_id": str(self.id), **zenml_log_metadata, - }, - context=ctx, + } ) + if log_model.uri: + attributes["zenml.log_uri"] = log_model.uri + + self._logger.emit(**emit_kwargs) + def finalize( self, log_model: LogsResponse, From 523a620aaae940dabcbb3d2e4a4288e0189de478 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Thu, 4 Dec 2025 11:11:08 +0100 Subject: [PATCH 65/81] Implement generic OTEL exporter --- pyproject.toml | 3 +- .../log_stores/datadog/datadog_log_store.py | 8 +- .../log_stores/otel/otel_log_exporter.py | 390 ++++++++++++++++++ 3 files changed, 394 insertions(+), 7 deletions(-) create mode 100644 src/zenml/log_stores/otel/otel_log_exporter.py diff --git a/pyproject.toml b/pyproject.toml index a3d65ba50a7..e2b232f927d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,8 +38,7 @@ dependencies = [ "docker~=7.1.0", "gitpython>=3.1.18,<4.0.0", "jsonref", - "opentelemetry-sdk>=1.0,<=1.38.0", - "opentelemetry-exporter-otlp-proto-http>=1.0,<=1.38.0", + "opentelemetry-sdk==1.38.0", "packaging>=24.1", "psutil>=5.0.0", "pydantic>=2.0,<=2.11.9", diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py 
index 85b25256463..6914af9044b 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -17,13 +17,11 @@ from typing import Any, Dict, List, Optional, cast import requests -from opentelemetry.exporter.otlp.proto.http._log_exporter import ( - OTLPLogExporter, -) from zenml.enums import LoggingLevels from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.datadog.datadog_flavor import DatadogLogStoreConfig +from zenml.log_stores.otel.otel_log_exporter import OTLPLogExporter from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger from zenml.models import LogsResponse @@ -54,7 +52,7 @@ def get_exporter(self) -> OTLPLogExporter: """Get the Datadog log exporter. Returns: - DatadogLogExporter configured with API key and site. + OTLPLogExporter configured with API key and site. """ if not self._otlp_exporter: self._otlp_exporter = OTLPLogExporter( @@ -206,5 +204,5 @@ def cleanup(self) -> None: This method is called when the log store is no longer needed. """ if self._otlp_exporter: - self._otlp_exporter.shutdown() # type: ignore[no-untyped-call] + self._otlp_exporter.shutdown() self._otlp_exporter = None diff --git a/src/zenml/log_stores/otel/otel_log_exporter.py b/src/zenml/log_stores/otel/otel_log_exporter.py new file mode 100644 index 00000000000..81503899d48 --- /dev/null +++ b/src/zenml/log_stores/otel/otel_log_exporter.py @@ -0,0 +1,390 @@ +# Copyright (c) ZenML GmbH 2025. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing +# permissions and limitations under the License. +"""OpenTelemetry exporter that writes logs to any OpenTelemetry backend.""" + +import json +import threading +from collections import defaultdict +from enum import StrEnum +from io import BytesIO +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence + +import requests +from opentelemetry.sdk._logs.export import LogExporter, LogExportResult +from opentelemetry.sdk.util.instrumentation import InstrumentationScope +from requests.adapters import HTTPAdapter +from urllib3 import Retry + +from zenml import __version__ as zenml_version +from zenml.logger import get_logger +from zenml.utils.json_utils import pydantic_encoder + +DEFAULT_TIMEOUT = 10 + +if TYPE_CHECKING: + from opentelemetry.sdk._logs import LogData + +logger = get_logger(__name__) + + +class Compression(StrEnum): + """Compression types.""" + + NoCompression = "none" + Deflate = "deflate" + Gzip = "gzip" + + +class OTLPLogExporter(LogExporter): + """OpenTelemetry exporter using JSON protocol. + + This exporter is a placeholder until the actual implementation of the + OpenTelemetry exporter is available in the opentelemetry-exporter-otlp-proto-json package. 
+ """ + + def __init__( + self, + endpoint: str, + certificate_file: Optional[str] = None, + client_key_file: Optional[str] = None, + client_certificate_file: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + timeout: float = DEFAULT_TIMEOUT, + compression: Compression = Compression.NoCompression, + ): + """Initialize the exporter. + + Args: + endpoint: The endpoint to export logs to. + certificate_file: The certificate file to use for the export. + client_key_file: The client key file to use for the export. + client_certificate_file: The client certificate file to use for the export. + headers: The headers to use for the export. + timeout: The timeout to use for the export. + compression: The compression to use for the export. + """ + self._shutdown_is_occuring = threading.Event() + self._endpoint = endpoint + self._certificate_file = certificate_file + self._client_key_file = client_key_file + self._client_certificate_file = client_certificate_file + self._client_cert = ( + (self._client_certificate_file, self._client_key_file) + if self._client_certificate_file and self._client_key_file + else self._client_certificate_file + ) + self._timeout = timeout + self._compression = compression + self._session = requests.Session() + self._session.headers.update( + { + "Content-Type": "application/json", + "Accept": "application/json", + "User-Agent": f"zenml/{zenml_version}", + } + ) + if headers: + self._session.headers.update(headers) + + # Retries are triggered on specific HTTP status codes: + # + # 408: Request Timeout. + # 429: Too Many Requests. + # 502: Bad Gateway. + # 503: Service Unavailable. + # 504: Gateway Timeout + # + # This also handles connection level errors, if a connection attempt + # fails due to transient issues like: + # + # DNS resolution errors. + # Connection timeouts. + # Network disruptions. + # + # Additional errors retried: + # + # Read Timeouts: If the server does not send a response within + # the timeout period. + # Connection Refused: If the server refuses the connection. + # + retries = Retry( + connect=5, + read=5, + redirect=3, + status=5, + allowed_methods=[ + "POST", + ], + status_forcelist=[ + 408, # Request Timeout + 429, # Too Many Requests + 500, # Internal Server Error + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout + ], + other=3, + backoff_factor=0.5, + respect_retry_after_header=True, + raise_on_status=False, + ) + http_adapter = HTTPAdapter( + max_retries=retries, + pool_maxsize=1, + ) + self._session.mount("https://", http_adapter) + self._session.mount("http://", http_adapter) + + self._shutdown = False + + def _export( + self, serialized_data: bytes, timeout_sec: float + ) -> requests.Response: + """Export a batch of logs to the OpenTelemetry backend. + + Args: + serialized_data: The serialized data to export. + timeout_sec: The timeout to use for the export. + + Returns: + The response from the export. 
+ """ + data = serialized_data + if self._compression == Compression.Gzip: + try: + import gzip + except ImportError: + logger.warning( + "gzip module not found, compression not supported" + ) + else: + gzip_data = BytesIO() + with gzip.GzipFile(fileobj=gzip_data, mode="w") as gzip_stream: + gzip_stream.write(serialized_data) + data = gzip_data.getvalue() + self._session.headers.update( + {"Content-Encoding": self._compression.value} + ) + elif self._compression == Compression.Deflate: + try: + import zlib + except ImportError: + logger.warning( + "zlib module not found, compression not supported" + ) + else: + data = zlib.compress(serialized_data) + self._session.headers.update( + {"Content-Encoding": self._compression.value} + ) + + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=timeout_sec, + cert=self._client_cert, + ) + + return resp + + @classmethod + def _encode_value(cls, value: Any, allow_null: bool = False) -> Any: + if allow_null is True and value is None: + return None + if isinstance(value, bool): + return dict(bool_value=value) + if isinstance(value, str): + return dict(string_value=value) + if isinstance(value, int): + return dict(int_value=value) + if isinstance(value, float): + return dict(double_value=value) + if isinstance(value, bytes): + return dict(bytes_value=value) + if isinstance(value, Sequence): + return dict( + array_value=dict( + values=[ + cls._encode_value(v, allow_null=allow_null) + for v in value + ] + ) + ) + elif isinstance(value, Mapping): + return dict( + kvlist_value=dict( + values=[ + { + str(k): cls._encode_value(v, allow_null=allow_null) + for k, v in value.items() + } + ] + ) + ) + raise ValueError(f"Invalid type {type(value)} of value {value}") + + @classmethod + def _encode_attributes(cls, attributes: Mapping[str, Any]) -> Any: + return [ + dict(key=k, value=cls._encode_value(v, allow_null=True)) + for k, v in attributes.items() + ] + + @classmethod + def _encode_log(cls, log_data: "LogData") -> Dict[str, Any]: + """Encode a log data object to a dictionary. + + Args: + log_data: The log data object to encode. + + Returns: + A dictionary representing the log data. + """ + span_id = ( + None + if log_data.log_record.span_id == 0 + else log_data.log_record.span_id + ) + trace_id = ( + None + if log_data.log_record.trace_id == 0 + else log_data.log_record.trace_id + ) + body = log_data.log_record.body + log_record = dict( + time_unix_nano=log_data.log_record.timestamp, + observed_time_unix_nano=log_data.log_record.observed_timestamp, + span_id=span_id, + trace_id=trace_id, + flags=int(log_data.log_record.trace_flags), + body=cls._encode_value(body, allow_null=True), + severity_text=log_data.log_record.severity_text, + attributes=cls._encode_attributes(log_data.log_record.attributes) + if log_data.log_record.attributes + else None, + dropped_attributes_count=log_data.log_record.dropped_attributes, + severity_number=getattr( + log_data.log_record.severity_number, "value", None + ), + event_name=log_data.log_record.event_name, + ) + + return {k: v for k, v in log_record.items() if v is not None} + + def _encode_logs(self, logs: Sequence["LogData"]) -> Dict[str, Any]: + """Encode a sequence of log data objects to a list of dictionaries. + + Args: + logs: The sequence of log data objects to encode. + + Returns: + A dictionary representing the log data. 
+ """ + resource_logs: Dict[Any, Dict[Any, List[Any]]] = defaultdict( + lambda: defaultdict(list) + ) + + for log_data in logs: + resource = log_data.log_record.resource + instrumentation = log_data.instrumentation_scope or None + json_log = self._encode_log(log_data) + + resource_logs[resource][instrumentation].append(json_log) + + json_resource_logs = [] + + for resource, instrumentations in resource_logs.items(): + scope_logs = [] + for instrumentation, json_logs in instrumentations.items(): + if isinstance(instrumentation, InstrumentationScope): + scope = dict( + name=instrumentation.name, + version=instrumentation.version, + schema_url=instrumentation.schema_url, + attributes=self._encode_attributes( + instrumentation.attributes + ) + if instrumentation.attributes + else None, + ) + else: + scope = None + + scope_logs.append( + dict( + scope=scope, + log_records=json_logs, + schema_url=instrumentation.schema_url + if instrumentation + else None, + attributes=self._encode_attributes( + instrumentation.attributes + ) + if instrumentation.attributes + else None, + ) + ) + + json_resource_logs.append( + dict( + resource=dict( + attributes=self._encode_attributes(resource.attributes) + if resource.attributes + else None, + ), + scope_logs=scope_logs, + schema_url=resource.schema_url, + ) + ) + + return dict(resource_logs=json_resource_logs) + + def export(self, batch: Sequence["LogData"]) -> LogExportResult: + """Export a batch of logs to the OpenTelemetry backend. + + Args: + batch: The batch of logs to export. + + Returns: + LogExportResult indicating success or failure. + """ + if self._shutdown: + logger.warning("Exporter already shutdown, ignoring batch") + return LogExportResult.FAILURE + + encoded_logs = self._encode_logs(batch) + + serialized_data = json.dumps( + encoded_logs, + default=pydantic_encoder, + ).encode("utf-8") + + try: + resp = self._export(serialized_data, self._timeout) + if resp.ok: + return LogExportResult.SUCCESS + return LogExportResult.FAILURE + except Exception as e: + logger.error(f"Error exporting logs: {e}") + return LogExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + if self._shutdown: + logger.warning("Exporter already shutdown, ignoring call") + return + self._shutdown = True + self._shutdown_is_occuring.set() + self._session.close() From 8857797d07e66960b5936e37a61b66f748bbe1df Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Thu, 4 Dec 2025 11:43:49 +0100 Subject: [PATCH 66/81] Fix docstrings and spelling errors --- src/zenml/log_stores/artifact/artifact_log_exporter.py | 3 --- src/zenml/log_stores/otel/otel_log_exporter.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 7a363703053..4e7deb00308 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -305,9 +305,6 @@ def _merge(self, log_uri: str) -> None: Args: log_uri: The URI of the log files to merge. - - Raises: - RuntimeError: If the log model has no URI, cannot merge logs. 
""" from zenml.artifacts.utils import _load_file_from_artifact_store from zenml.exceptions import DoesNotExistException diff --git a/src/zenml/log_stores/otel/otel_log_exporter.py b/src/zenml/log_stores/otel/otel_log_exporter.py index 81503899d48..0d7d8b5e8d1 100644 --- a/src/zenml/log_stores/otel/otel_log_exporter.py +++ b/src/zenml/log_stores/otel/otel_log_exporter.py @@ -74,7 +74,7 @@ def __init__( timeout: The timeout to use for the export. compression: The compression to use for the export. """ - self._shutdown_is_occuring = threading.Event() + self._shutdown_is_occurring = threading.Event() self._endpoint = endpoint self._certificate_file = certificate_file self._client_key_file = client_key_file @@ -386,5 +386,5 @@ def shutdown(self) -> None: logger.warning("Exporter already shutdown, ignoring call") return self._shutdown = True - self._shutdown_is_occuring.set() + self._shutdown_is_occurring.set() self._session.close() From 3d59d0008e147cd9f629bb19edc31c6925274eee Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Thu, 4 Dec 2025 11:46:57 +0100 Subject: [PATCH 67/81] Fix linter errors --- src/zenml/log_stores/otel/otel_log_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zenml/log_stores/otel/otel_log_exporter.py b/src/zenml/log_stores/otel/otel_log_exporter.py index 0d7d8b5e8d1..21e61dc6c74 100644 --- a/src/zenml/log_stores/otel/otel_log_exporter.py +++ b/src/zenml/log_stores/otel/otel_log_exporter.py @@ -16,7 +16,6 @@ import json import threading from collections import defaultdict -from enum import StrEnum from io import BytesIO from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence @@ -28,6 +27,7 @@ from zenml import __version__ as zenml_version from zenml.logger import get_logger +from zenml.utils.enum_utils import StrEnum from zenml.utils.json_utils import pydantic_encoder DEFAULT_TIMEOUT = 10 From 55563b3c9853dd23bc8ccd4072367bed08d802f7 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 13:12:29 +0100 Subject: [PATCH 68/81] fixing the unit tests --- tests/unit/deployers/server/test_service_outputs.py | 4 ++++ tests/unit/orchestrators/test_step_runner.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/tests/unit/deployers/server/test_service_outputs.py b/tests/unit/deployers/server/test_service_outputs.py index 64c90ac580e..4e61bf8fee7 100644 --- a/tests/unit/deployers/server/test_service_outputs.py +++ b/tests/unit/deployers/server/test_service_outputs.py @@ -78,6 +78,10 @@ def __init__(self) -> None: self.id = uuid4() self.name = "test-run" self.log_collection = None + self.project = SimpleNamespace(id=uuid4(), name="test-project") + self.pipeline = None + self.stack = None + self.user = None class _DummyDeploymentAppRunnerFlavor(BaseDeploymentAppRunnerFlavor): diff --git a/tests/unit/orchestrators/test_step_runner.py b/tests/unit/orchestrators/test_step_runner.py index a13338c498d..d6ff54a7d13 100644 --- a/tests/unit/orchestrators/test_step_runner.py +++ b/tests/unit/orchestrators/test_step_runner.py @@ -61,6 +61,10 @@ def test_running_a_successful_step( mock_publish_successful_step_run = mocker.patch( "zenml.orchestrators.step_runner.publish_successful_step_run" ) + mocker.patch( + "zenml.orchestrators.step_runner.setup_step_logging", + return_value=mocker.MagicMock(__enter__=lambda s: None, __exit__=lambda s, *a: None), + ) step = Step.model_validate( { @@ -120,6 +124,10 @@ def test_running_a_failing_step( mock_publish_successful_step_run = mocker.patch( 
"zenml.orchestrators.step_runner.publish_successful_step_run" ) + mocker.patch( + "zenml.orchestrators.step_runner.setup_step_logging", + return_value=mocker.MagicMock(__enter__=lambda s: None, __exit__=lambda s, *a: None), + ) step = Step.model_validate( { From 14c2f0005500c664269322e172ded58c8fc27c20 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 13:42:11 +0100 Subject: [PATCH 69/81] format --- tests/unit/orchestrators/test_step_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/unit/orchestrators/test_step_runner.py b/tests/unit/orchestrators/test_step_runner.py index d6ff54a7d13..4b487d401cf 100644 --- a/tests/unit/orchestrators/test_step_runner.py +++ b/tests/unit/orchestrators/test_step_runner.py @@ -63,7 +63,9 @@ def test_running_a_successful_step( ) mocker.patch( "zenml.orchestrators.step_runner.setup_step_logging", - return_value=mocker.MagicMock(__enter__=lambda s: None, __exit__=lambda s, *a: None), + return_value=mocker.MagicMock( + __enter__=lambda s: None, __exit__=lambda s, *a: None + ), ) step = Step.model_validate( @@ -126,7 +128,9 @@ def test_running_a_failing_step( ) mocker.patch( "zenml.orchestrators.step_runner.setup_step_logging", - return_value=mocker.MagicMock(__enter__=lambda s: None, __exit__=lambda s, *a: None), + return_value=mocker.MagicMock( + __enter__=lambda s: None, __exit__=lambda s, *a: None + ), ) step = Step.model_validate( From 7bbac462d4ee9c18a59b0caa37f6558853eb1f84 Mon Sep 17 00:00:00 2001 From: Stefan Nica Date: Thu, 4 Dec 2025 16:20:23 +0100 Subject: [PATCH 70/81] Improved otel exporter to use correct fields --- .../log_stores/datadog/datadog_log_store.py | 52 +++++-------------- .../log_stores/otel/otel_log_exporter.py | 4 +- 2 files changed, 16 insertions(+), 40 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 6914af9044b..49823b48b2d 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -56,7 +56,7 @@ def get_exporter(self) -> OTLPLogExporter: """ if not self._otlp_exporter: self._otlp_exporter = OTLPLogExporter( - endpoint=f"https://http-intake.logs.{self.config.site}/v1/logs", + endpoint=f"https://otlp.{self.config.site}/v1/logs", headers={"dd-api-key": self.config.api_key.get_secret_value()}, ) return self._otlp_exporter @@ -148,15 +148,21 @@ def fetch( exc_stacktrace = exc_info.get("stacktrace") message += f"\n{exc_type}: {exc_message}\n{exc_stacktrace}" + timestamp = datetime.fromisoformat( + log_fields["timestamp"].replace("Z", "+00:00") + ) + severity = log_fields.get("status", "info").upper() + log_severity = ( + LoggingLevels[severity] + if severity in LoggingLevels.__members__ + else LoggingLevels.INFO + ) + # Parse log entry entry = LogEntry( message=message, - level=self._parse_log_level(log_fields.get("status")), - timestamp=datetime.fromisoformat( - log_fields["timestamp"].replace("Z", "+00:00") - ) - if "timestamp" in log_fields - else None, + level=log_severity, + timestamp=timestamp, ) log_entries.append(entry) @@ -165,39 +171,9 @@ def fetch( return log_entries except Exception as e: - logger.error(f"Error fetching logs from Datadog: {e}") + logger.exception(f"Error fetching logs from Datadog: {e}") return [] - def _parse_log_level( - self, status: Optional[str] - ) -> Optional["LoggingLevels"]: - """Parse Datadog log status to ZenML log level. - - Args: - status: Datadog log status string. 
- - Returns: - ZenML LoggingLevels enum value. - """ - from zenml.enums import LoggingLevels - - if not status: - return None - - status_upper = status.upper() - if status_upper in ["DEBUG", "TRACE"]: - return LoggingLevels.DEBUG - elif status_upper in ["INFO", "INFORMATION"]: - return LoggingLevels.INFO - elif status_upper in ["WARN", "WARNING"]: - return LoggingLevels.WARN - elif status_upper == "ERROR": - return LoggingLevels.ERROR - elif status_upper in ["CRITICAL", "FATAL", "EMERGENCY"]: - return LoggingLevels.CRITICAL - else: - return LoggingLevels.INFO - def cleanup(self) -> None: """Cleanup the Datadog log store. diff --git a/src/zenml/log_stores/otel/otel_log_exporter.py b/src/zenml/log_stores/otel/otel_log_exporter.py index 21e61dc6c74..57995085599 100644 --- a/src/zenml/log_stores/otel/otel_log_exporter.py +++ b/src/zenml/log_stores/otel/otel_log_exporter.py @@ -17,6 +17,7 @@ import threading from collections import defaultdict from io import BytesIO +from time import time_ns from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence import requests @@ -265,7 +266,7 @@ def _encode_log(cls, log_data: "LogData") -> Dict[str, Any]: body = log_data.log_record.body log_record = dict( time_unix_nano=log_data.log_record.timestamp, - observed_time_unix_nano=log_data.log_record.observed_timestamp, + observed_time_unix_nano=time_ns(), span_id=span_id, trace_id=trace_id, flags=int(log_data.log_record.trace_flags), @@ -363,7 +364,6 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: if self._shutdown: logger.warning("Exporter already shutdown, ignoring batch") return LogExportResult.FAILURE - encoded_logs = self._encode_logs(batch) serialized_data = json.dumps( From fd0368d52dfd49e362b36df126ea16ce0767908e Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 16:54:34 +0100 Subject: [PATCH 71/81] small fix to the runner --- src/zenml/execution/pipeline/dynamic/runner.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 884e9c15652..91dfe86acbf 100644 --- a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -57,6 +57,7 @@ from zenml.models import ( ArtifactVersionResponse, PipelineRunResponse, + PipelineRunUpdate, PipelineSnapshotResponse, ) from zenml.orchestrators.publish_utils import ( @@ -157,10 +158,18 @@ def pipeline(self) -> "DynamicPipeline": def run_pipeline(self) -> None: """Run the pipeline.""" with InMemoryArtifactCache(): - run = self._run or create_placeholder_run( - snapshot=self._snapshot, - orchestrator_run_id=self._orchestrator_run_id, - ) + if self._run: + run = Client().zen_store.update_run( + run_id=self._run.id, + run_update=PipelineRunUpdate( + orchestrator_run_id=self._orchestrator_run_id, + ), + ) + else: + run = create_placeholder_run( + snapshot=self._snapshot, + orchestrator_run_id=self._orchestrator_run_id, + ) logging_context = nullcontext() if is_pipeline_logging_enabled( From 1d0c7bf7c3587e29f8ec4f4bbf09022896fc6bc3 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 16:56:56 +0100 Subject: [PATCH 72/81] one more --- .../execution/pipeline/dynamic/runner.py | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py index 91dfe86acbf..f0bd66b4bf1 100644 --- 
a/src/zenml/execution/pipeline/dynamic/runner.py +++ b/src/zenml/execution/pipeline/dynamic/runner.py @@ -157,30 +157,28 @@ def pipeline(self) -> "DynamicPipeline": def run_pipeline(self) -> None: """Run the pipeline.""" - with InMemoryArtifactCache(): - if self._run: - run = Client().zen_store.update_run( - run_id=self._run.id, - run_update=PipelineRunUpdate( - orchestrator_run_id=self._orchestrator_run_id, - ), - ) - else: - run = create_placeholder_run( - snapshot=self._snapshot, + if self._run: + run = Client().zen_store.update_run( + run_id=self._run.id, + run_update=PipelineRunUpdate( orchestrator_run_id=self._orchestrator_run_id, - ) + ), + ) + else: + run = create_placeholder_run( + snapshot=self._snapshot, + orchestrator_run_id=self._orchestrator_run_id, + ) - logging_context = nullcontext() - if is_pipeline_logging_enabled( - self._snapshot.pipeline_configuration - ): - logging_context = setup_run_logging( - pipeline_run=run, - source="orchestrator", - ) + logging_context = nullcontext() + if is_pipeline_logging_enabled(self._snapshot.pipeline_configuration): + logging_context = setup_run_logging( + pipeline_run=run, + source="orchestrator", + ) - with logging_context: + with logging_context: + with InMemoryArtifactCache(): with DynamicPipelineRunContext( pipeline=self.pipeline, run=run, From 526eb3183fedb8e0fd703416108f02823b825864 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 17:10:08 +0100 Subject: [PATCH 73/81] removed todo --- src/zenml/log_stores/datadog/datadog_flavor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index c9b7a1677e6..7a6d912c233 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -75,7 +75,6 @@ def sdk_docs_url(self) -> str: """ return self.docs_url - # TODO: Add logo for the Datadog log store @property def logo_url(self) -> str: """URL to the flavor logo. 
From 5b84a26c26bf7ef5a89033ee7f8f1f92796460d3 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 17:32:51 +0100 Subject: [PATCH 74/81] more minor fixes --- src/zenml/constants.py | 4 ++-- src/zenml/logger.py | 8 ++++++++ src/zenml/zen_stores/schemas/step_run_schemas.py | 6 +++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/zenml/constants.py b/src/zenml/constants.py index 893506c255c..28376703367 100644 --- a/src/zenml/constants.py +++ b/src/zenml/constants.py @@ -228,14 +228,14 @@ def handle_int_env_var(var: str, default: int = 0) -> int: ENV_ZENML_LOGGING_VERBOSITY, default="DEBUG" ).upper() ZENML_STORAGE_LOGGING_VERBOSITY = os.getenv( - ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default="DEBUG" + ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default=None ).upper() else: ZENML_LOGGING_VERBOSITY = os.getenv( ENV_ZENML_LOGGING_VERBOSITY, default="INFO" ).upper() ZENML_STORAGE_LOGGING_VERBOSITY = os.getenv( - ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default="INFO" + ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default=None ).upper() INSIDE_ZENML_CONTAINER = handle_bool_env_var(ENV_ZENML_CONTAINER, False) diff --git a/src/zenml/logger.py b/src/zenml/logger.py index bf386495cd4..b606b05e127 100644 --- a/src/zenml/logger.py +++ b/src/zenml/logger.py @@ -27,6 +27,7 @@ ENV_ZENML_LOGGING_COLORS_DISABLED, ENV_ZENML_SUPPRESS_LOGS, ZENML_LOGGING_VERBOSITY, + ZENML_STORAGE_LOGGING_VERBOSITY, handle_bool_env_var, ) from zenml.enums import LoggingLevels @@ -225,6 +226,13 @@ def get_logging_level() -> LoggingLevels: raise KeyError( f"Verbosity must be one of {list(LoggingLevels.__members__.keys())}" ) + + if ZENML_STORAGE_LOGGING_VERBOSITY is not None: + get_logger(__name__).warning( + "The ZENML_STORAGE_LOGGING_VERBOSITY is no longer supported. " + "Please use the ZENML_LOGGING_VERBOSITY instead." + ) + return LoggingLevels[verbosity] diff --git a/src/zenml/zen_stores/schemas/step_run_schemas.py b/src/zenml/zen_stores/schemas/step_run_schemas.py index c539e3b18d5..bc5cb209d78 100644 --- a/src/zenml/zen_stores/schemas/step_run_schemas.py +++ b/src/zenml/zen_stores/schemas/step_run_schemas.py @@ -280,8 +280,8 @@ def get_query_options( # if include_metadata: # options.extend( # [ - # # joinedload(jl_arg(StepRunSchema.parents)), - # # joinedload(jl_arg(StepRunSchema.run_metadata)), + # joinedload(jl_arg(StepRunSchema.parents)), + # joinedload(jl_arg(StepRunSchema.run_metadata)), # ] # ) @@ -485,7 +485,7 @@ def to_model( ) # Add the step logs as "logs" if they exist, for backwards compatibility - # TODO: This will be safe to remove in future releases (>0.84.0). + # TODO: This will be safe to remove in future releases (>0.93.0). 
step_logs = [ log_entry for log_entry in self.logs From a41f4e8b5d3c2da0a99db00da424f46de6ca492f Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 17:41:38 +0100 Subject: [PATCH 75/81] sql zen store changes --- src/zenml/zen_stores/sql_zen_store.py | 100 +++++++++++++++++--------- 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index c578d92d59a..eb5c463368c 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -6566,13 +6566,22 @@ def _create_run( # Add logs entry for the run if exists if pipeline_run.logs is not None: - self._get_reference_schema_by_id( - resource=pipeline_run, - reference_schema=StackComponentSchema, - reference_id=pipeline_run.logs.artifact_store_id, - session=session, - reference_type="logs artifact store", - ) + if pipeline_run.logs.artifact_store_id: + self._get_reference_schema_by_id( + resource=pipeline_run, + reference_schema=StackComponentSchema, + reference_id=pipeline_run.logs.artifact_store_id, + session=session, + reference_type="logs artifact store", + ) + else: + self._get_reference_schema_by_id( + resource=pipeline_run, + reference_schema=StackComponentSchema, + reference_id=pipeline_run.logs.log_store_id, + session=session, + reference_type="logs log store", + ) log_entry = LogsSchema( id=pipeline_run.logs.id, @@ -7051,13 +7060,22 @@ def update_run( try: for log_request in run_update.add_logs: # Validate the artifact store exists - self._get_reference_schema_by_id( - resource=log_request, - reference_schema=StackComponentSchema, - reference_id=log_request.artifact_store_id, - session=session, - reference_type="logs artifact store", - ) + if log_request.artifact_store_id: + self._get_reference_schema_by_id( + resource=log_request, + reference_schema=StackComponentSchema, + reference_id=log_request.artifact_store_id, + session=session, + reference_type="logs artifact store", + ) + else: + self._get_reference_schema_by_id( + resource=log_request, + reference_schema=StackComponentSchema, + reference_id=log_request.log_store_id, + session=session, + reference_type="logs log store", + ) # Create the log entry log_entry = LogsSchema( @@ -10111,20 +10129,29 @@ def create_run_step(self, step_run: StepRunRequest) -> StepRunResponse: # Add logs entry for the step if exists if step_run.logs is not None: - self._get_reference_schema_by_id( - resource=step_run, - reference_schema=StackComponentSchema, - reference_id=step_run.logs.artifact_store_id, - session=session, - reference_type="logs artifact store", - ) + if step_run.logs.artifact_store_id: + self._get_reference_schema_by_id( + resource=step_run, + reference_schema=StackComponentSchema, + reference_id=step_run.logs.artifact_store_id, + session=session, + reference_type="logs artifact store", + ) + else: + self._get_reference_schema_by_id( + resource=step_run, + reference_schema=StackComponentSchema, + reference_id=step_run.logs.log_store_id, + session=session, + reference_type="logs log store", + ) log_entry = LogsSchema( id=step_run.logs.id, uri=step_run.logs.uri, # TODO: Remove fallback when not supporting - # clients <0.84.0 anymore - source=step_run.logs.source or "execution", + # clients <0.93.0 anymore + source=step_run.logs.source or "step", step_run_id=step_schema.id, artifact_store_id=step_run.logs.artifact_store_id, log_store_id=step_run.logs.log_store_id, @@ -10520,21 +10547,30 @@ def update_run_step( try: for log_request in 
step_run_update.add_logs: # Validate the artifact store exists - self._get_reference_schema_by_id( - resource=log_request, - reference_schema=StackComponentSchema, - reference_id=log_request.artifact_store_id, - session=session, - reference_type="logs artifact store", - ) + if log_request.artifact_store_id: + self._get_reference_schema_by_id( + resource=log_request, + reference_schema=StackComponentSchema, + reference_id=log_request.artifact_store_id, + session=session, + reference_type="logs artifact store", + ) + else: + self._get_reference_schema_by_id( + resource=log_request, + reference_schema=StackComponentSchema, + reference_id=log_request.log_store_id, + session=session, + reference_type="logs log store", + ) # Create the log entry log_entry = LogsSchema( id=log_request.id, uri=log_request.uri, # TODO: Remove fallback when not supporting - # clients <0.84.0 anymore - source=log_request.source or "execution", + # clients <0.93.0 anymore + source=log_request.source or "step", step_run_id=existing_step_run.id, artifact_store_id=log_request.artifact_store_id, log_store_id=log_request.log_store_id, From 3725797c91f80e046da949c5af227816eaec73f1 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 17:48:08 +0100 Subject: [PATCH 76/81] more minor fixes --- src/zenml/log_stores/artifact/artifact_log_store.py | 4 +--- src/zenml/log_stores/otel/otel_flavor.py | 1 + src/zenml/zen_stores/sql_zen_store.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index 2d030913139..bc1d1cbae09 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -32,9 +32,7 @@ from zenml.exceptions import DoesNotExistException from zenml.log_stores.base_log_store import MAX_ENTRIES_PER_REQUEST from zenml.log_stores.otel.otel_flavor import OtelLogStoreConfig -from zenml.log_stores.otel.otel_log_store import ( - OtelLogStore, -) +from zenml.log_stores.otel.otel_log_store import OtelLogStore from zenml.logger import get_logger from zenml.models import LogsResponse from zenml.utils.io_utils import sanitize_remote_path diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index f0338bd25c2..c4ad17a083d 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -24,6 +24,7 @@ class OtelLogStoreConfig(BaseLogStoreConfig): Attributes: service_name: Name of the service (defaults to "zenml"). + service_version: Version of the service (defaults to the ZenML version). max_queue_size: Maximum queue size for batch processor. schedule_delay_millis: Delay between batch exports in milliseconds. max_export_batch_size: Maximum batch size for exports. 
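As a rough sketch of the data shape behind the validation branches added to the SQL store in the patches above: a logs entry now references either an artifact store (file-based logs) or a log store (external backends such as Datadog), and the store validates whichever id is set. The names below (LogsRequest, validate_logs_reference) are hypothetical illustrations, not ZenML's actual models.

from dataclasses import dataclass
from typing import Optional
from uuid import UUID, uuid4

@dataclass
class LogsRequest:
    uri: str
    artifact_store_id: Optional[UUID] = None
    log_store_id: Optional[UUID] = None

def validate_logs_reference(request: LogsRequest) -> str:
    # One of the two references is expected to be set; validation follows
    # whichever one is present, mirroring the if/else branches in the diffs.
    if request.artifact_store_id:
        return f"validate artifact store {request.artifact_store_id}"
    if request.log_store_id:
        return f"validate log store {request.log_store_id}"
    raise ValueError("logs request must reference an artifact store or a log store")

print(validate_logs_reference(LogsRequest(uri="s3://logs/run", log_store_id=uuid4())))
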
diff --git a/src/zenml/zen_stores/sql_zen_store.py b/src/zenml/zen_stores/sql_zen_store.py index eb5c463368c..212ae690dca 100644 --- a/src/zenml/zen_stores/sql_zen_store.py +++ b/src/zenml/zen_stores/sql_zen_store.py @@ -6574,7 +6574,7 @@ def _create_run( session=session, reference_type="logs artifact store", ) - else: + else: self._get_reference_schema_by_id( resource=pipeline_run, reference_schema=StackComponentSchema, From 8c89ef1c1efa59e87d916cdf83b0351c31de7173 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Thu, 4 Dec 2025 18:00:32 +0100 Subject: [PATCH 77/81] another small fix --- src/zenml/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zenml/constants.py b/src/zenml/constants.py index 28376703367..df868720fee 100644 --- a/src/zenml/constants.py +++ b/src/zenml/constants.py @@ -229,14 +229,14 @@ def handle_int_env_var(var: str, default: int = 0) -> int: ).upper() ZENML_STORAGE_LOGGING_VERBOSITY = os.getenv( ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default=None - ).upper() + ) else: ZENML_LOGGING_VERBOSITY = os.getenv( ENV_ZENML_LOGGING_VERBOSITY, default="INFO" ).upper() ZENML_STORAGE_LOGGING_VERBOSITY = os.getenv( ENV_ZENML_STORAGE_LOGGING_VERBOSITY, default=None - ).upper() + ) INSIDE_ZENML_CONTAINER = handle_bool_env_var(ENV_ZENML_CONTAINER, False) From 3562adef93e594fd18dfe6e241231b8019d52ea8 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 5 Dec 2025 02:17:17 +0100 Subject: [PATCH 78/81] minor fixes --- .../log_stores/artifact/artifact_log_exporter.py | 15 ++++++++------- .../log_stores/artifact/artifact_log_store.py | 11 +++++++---- src/zenml/log_stores/datadog/datadog_flavor.py | 4 ++-- .../log_stores/datadog/datadog_log_store.py | 5 ++--- src/zenml/log_stores/otel/otel_log_store.py | 16 +++++++--------- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index 4e7deb00308..eab843f046b 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -72,7 +72,8 @@ def export(self, batch: Sequence["LogData"]) -> LogExportResult: attrs = log_data.log_record.attributes if not attrs: continue - log_uri = attrs.get("zenml.log_uri") + + log_uri = attrs.get("zenml.log_model.uri") if not log_uri or not isinstance(log_uri, str): continue @@ -110,13 +111,13 @@ def _otel_record_to_log_entries( List of LogEntry objects (multiple if message was chunked). 
""" log_record = log_data.log_record + attributes = log_record.attributes + message = str(log_record.body) if log_record.body else "" - if log_record.attributes and log_record.attributes.get( - "exception.message" - ): - exc_message = log_record.attributes.get("exception.message") - exc_type = log_record.attributes.get("exception.type") - exc_stacktrace = log_record.attributes.get("exception.stacktrace") + if attributes and attributes.get("exception.message"): + exc_message = attributes.get("exception.message") + exc_type = attributes.get("exception.type") + exc_stacktrace = attributes.get("exception.stacktrace") message += f"\n{exc_type}: {exc_message}\n{exc_stacktrace}" message = remove_ansi_escape_codes(message).rstrip() diff --git a/src/zenml/log_stores/artifact/artifact_log_store.py b/src/zenml/log_stores/artifact/artifact_log_store.py index bc1d1cbae09..d7db8a21b4d 100644 --- a/src/zenml/log_stores/artifact/artifact_log_store.py +++ b/src/zenml/log_stores/artifact/artifact_log_store.py @@ -85,12 +85,14 @@ def remove_ansi_escape_codes(text: str) -> str: def fetch_log_records( artifact_store: "BaseArtifactStore", logs_uri: str, + limit: int = MAX_ENTRIES_PER_REQUEST, ) -> List[LogEntry]: """Fetches log entries. Args: artifact_store: The artifact store. logs_uri: The URI of the artifact (file or directory). + limit: Maximum number of log entries to return. Returns: List of log entries. @@ -101,7 +103,7 @@ def fetch_log_records( if log_entry := parse_log_entry(line): log_entries.append(log_entry) - if len(log_entries) >= MAX_ENTRIES_PER_REQUEST: + if len(log_entries) >= limit: break return log_entries @@ -279,8 +281,8 @@ def finalize( self._logger.emit( body=END_OF_STREAM_MESSAGE, attributes={ - "zenml.log_id": str(log_model.id), - "zenml.log_uri": str(log_model.uri), + "zenml.log_model.id": str(log_model.id), + "zenml.log_model.uri": str(log_model.uri), }, ) @@ -331,9 +333,10 @@ def fetch( log_entries = fetch_log_records( artifact_store=self._artifact_store, logs_uri=logs_model.uri, + limit=limit, ) - return log_entries[:limit] + return log_entries def cleanup(self) -> None: """Cleanup the artifact log store. diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 7a6d912c233..035bf0192d9 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -64,7 +64,7 @@ def docs_url(self) -> str: Returns: The URL to the flavor documentation. """ - return "https://docs.zenml.io/stack-components/log-stores/datadog" + return self.generate_default_docs_url() @property def sdk_docs_url(self) -> str: @@ -73,7 +73,7 @@ def sdk_docs_url(self) -> str: Returns: The URL to the SDK docs for this flavor. """ - return self.docs_url + return self.generate_default_sdk_docs_url() @property def logo_url(self) -> str: diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index 49823b48b2d..c2fc583828e 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -83,15 +83,14 @@ def fetch( Returns: List of log entries from Datadog. 
""" - # Build query query_parts = [ f"service:{self.config.service_name}", - f"@zenml.log_id:{logs_model.id}", + f"service.version:{self.config.service_version}", + f"@zenml.log_model.id:{logs_model.id}", ] query = " ".join(query_parts) - # Build API request api_endpoint = ( f"https://api.{self.config.site}/api/v2/logs/events/search" ) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index 8deb8cdf6ac..c4d4e165814 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -136,21 +136,19 @@ def emit( attributes = emit_kwargs.get("attributes", {}) - zenml_log_metadata = { - f"zenml.{key}": value for key, value in metadata.items() - } - attributes.update( { - "zenml.log_id": str(log_model.id), "zenml.log_store_id": str(self.id), - **zenml_log_metadata, + "zenml.log_model.id": str(log_model.id), + "zenml.log_model.uri": str(log_model.uri), + "zenml.log_model.artifact_store_id": str( + log_model.artifact_store_id + ), + "zenml.log_model.source": log_model.source, + **{f"zenml.{key}": value for key, value in metadata.items()}, } ) - if log_model.uri: - attributes["zenml.log_uri"] = log_model.uri - self._logger.emit(**emit_kwargs) def finalize( From 11def579fca4f9c0ee47bb035a0b99b46fc2ce50 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Fri, 5 Dec 2025 03:15:49 +0100 Subject: [PATCH 79/81] better log entry fetching --- .../log_stores/datadog/datadog_log_store.py | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/zenml/log_stores/datadog/datadog_log_store.py b/src/zenml/log_stores/datadog/datadog_log_store.py index c2fc583828e..2520f14b025 100644 --- a/src/zenml/log_stores/datadog/datadog_log_store.py +++ b/src/zenml/log_stores/datadog/datadog_log_store.py @@ -13,7 +13,7 @@ # permissions and limitations under the License. 
"""Datadog log store implementation.""" -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict, List, Optional, cast import requests @@ -85,7 +85,6 @@ def fetch( """ query_parts = [ f"service:{self.config.service_name}", - f"service.version:{self.config.service_version}", f"@zenml.log_model.id:{logs_model.id}", ] @@ -117,7 +116,7 @@ def fetch( "page": { "limit": min(limit, 1000), # Datadog API limit }, - "sort": "timestamp", + "sort": "@otel.timestamp", } try: @@ -140,16 +139,33 @@ def fetch( for log in data.get("data", []): log_fields = log.get("attributes", {}) message = log_fields.get("message", "") - attributes = log_fields.get("attributes", {}) - if exc_info := attributes.get("exception"): + nested_attrs = log_fields.get("attributes", {}) + + if exc_info := nested_attrs.get("exception"): exc_message = exc_info.get("message") exc_type = exc_info.get("type") exc_stacktrace = exc_info.get("stacktrace") message += f"\n{exc_type}: {exc_message}\n{exc_stacktrace}" - timestamp = datetime.fromisoformat( - log_fields["timestamp"].replace("Z", "+00:00") - ) + code_info = nested_attrs.get("code", {}) + filename = code_info.get("file", {}).get("path") + lineno = code_info.get("line", {}).get("number") + function_name = code_info.get("function", {}).get("name") + + otel_info = nested_attrs.get("otel", {}) + logger_name = otel_info.get("library", {}).get("name") + + timestamp_ns_str = otel_info.get("timestamp") + if timestamp_ns_str: + timestamp_ns = int(timestamp_ns_str) + timestamp = datetime.fromtimestamp( + timestamp_ns / 1e9, tz=timezone.utc + ) + else: + timestamp = datetime.fromisoformat( + log_fields["timestamp"].replace("Z", "+00:00") + ) + severity = log_fields.get("status", "info").upper() log_severity = ( LoggingLevels[severity] @@ -157,11 +173,20 @@ def fetch( else LoggingLevels.INFO ) - # Parse log entry + module = None + if function_name: + module = function_name + elif filename: + module = filename.rsplit("/", 1)[-1].replace(".py", "") + entry = LogEntry( message=message, level=log_severity, timestamp=timestamp, + name=logger_name, + filename=filename, + lineno=lineno, + module=module, ) log_entries.append(entry) From 8dfcedfbc66cd8d26b9ca2dedc7a13a5c5fe8b09 Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sat, 6 Dec 2025 01:20:16 +0100 Subject: [PATCH 80/81] late night changes --- src/zenml/constants.py | 25 +++++++++++++++++++ .../artifact/artifact_log_exporter.py | 6 +++++ src/zenml/log_stores/otel/otel_flavor.py | 16 +++++++++--- src/zenml/log_stores/otel/otel_log_store.py | 8 +++++- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/src/zenml/constants.py b/src/zenml/constants.py index df868720fee..47fec31011f 100644 --- a/src/zenml/constants.py +++ b/src/zenml/constants.py @@ -547,3 +547,28 @@ def handle_int_env_var(var: str, default: int = 0) -> int: LOGS_MERGE_INTERVAL_SECONDS = handle_int_env_var( ENV_ZENML_LOGS_MERGE_INTERVAL_SECONDS, default=10 * 60 ) + +# OpenTelemetry log storage constants +ENV_ZENML_LOGS_OTEL_MAX_QUEUE_SIZE = "ZENML_LOGS_OTEL_MAX_QUEUE_SIZE" +ENV_ZENML_LOGS_OTEL_SCHEDULE_DELAY_MILLIS = ( + "ZENML_LOGS_OTEL_SCHEDULE_DELAY_MILLIS" +) +ENV_ZENML_LOGS_OTEL_MAX_EXPORT_BATCH_SIZE = ( + "ZENML_LOGS_OTEL_MAX_EXPORT_BATCH_SIZE" +) +ENV_ZENML_LOGS_OTEL_EXPORT_TIMEOUT_MILLIS = ( + "ZENML_LOGS_OTEL_EXPORT_TIMEOUT_MILLIS" +) + +LOGS_OTEL_MAX_QUEUE_SIZE = handle_int_env_var( + ENV_ZENML_LOGS_OTEL_MAX_QUEUE_SIZE, default=100000 +) +LOGS_OTEL_SCHEDULE_DELAY_MILLIS = handle_int_env_var( + 
ENV_ZENML_LOGS_OTEL_SCHEDULE_DELAY_MILLIS, default=5000 +) +LOGS_OTEL_MAX_EXPORT_BATCH_SIZE = handle_int_env_var( + ENV_ZENML_LOGS_OTEL_MAX_EXPORT_BATCH_SIZE, default=5000 +) +LOGS_OTEL_EXPORT_TIMEOUT_MILLIS = handle_int_env_var( + ENV_ZENML_LOGS_OTEL_EXPORT_TIMEOUT_MILLIS, default=15000 +) diff --git a/src/zenml/log_stores/artifact/artifact_log_exporter.py b/src/zenml/log_stores/artifact/artifact_log_exporter.py index eab843f046b..a1869461f59 100644 --- a/src/zenml/log_stores/artifact/artifact_log_exporter.py +++ b/src/zenml/log_stores/artifact/artifact_log_exporter.py @@ -310,6 +310,12 @@ def _merge(self, log_uri: str) -> None: from zenml.artifacts.utils import _load_file_from_artifact_store from zenml.exceptions import DoesNotExistException + # Check if the log directory exists - it may not if no logs + # were written yet. The URI folder gets created only when the + # first log message is sent. + if not self.artifact_store.exists(log_uri): + return + files_ = self.artifact_store.listdir(log_uri) if len(files_) > 1: files_.sort() diff --git a/src/zenml/log_stores/otel/otel_flavor.py b/src/zenml/log_stores/otel/otel_flavor.py index c4ad17a083d..e392ffad04c 100644 --- a/src/zenml/log_stores/otel/otel_flavor.py +++ b/src/zenml/log_stores/otel/otel_flavor.py @@ -16,6 +16,12 @@ from pydantic import Field from zenml import __version__ +from zenml.constants import ( + LOGS_OTEL_EXPORT_TIMEOUT_MILLIS, + LOGS_OTEL_MAX_EXPORT_BATCH_SIZE, + LOGS_OTEL_MAX_QUEUE_SIZE, + LOGS_OTEL_SCHEDULE_DELAY_MILLIS, +) from zenml.log_stores import BaseLogStoreConfig @@ -39,14 +45,18 @@ class OtelLogStoreConfig(BaseLogStoreConfig): description="Version of the service for telemetry", ) max_queue_size: int = Field( - default=8096, + default=LOGS_OTEL_MAX_QUEUE_SIZE, description="Maximum queue size for batch log processor", ) schedule_delay_millis: int = Field( - default=15000, + default=LOGS_OTEL_SCHEDULE_DELAY_MILLIS, description="Export interval in milliseconds", ) max_export_batch_size: int = Field( - default=512, + default=LOGS_OTEL_MAX_EXPORT_BATCH_SIZE, description="Maximum batch size for exports", ) + export_timeout_millis: int = Field( + default=LOGS_OTEL_EXPORT_TIMEOUT_MILLIS, + description="Timeout for each export batch in milliseconds", + ) diff --git a/src/zenml/log_stores/otel/otel_log_store.py b/src/zenml/log_stores/otel/otel_log_store.py index c4d4e165814..dc6fd3f4ec2 100644 --- a/src/zenml/log_stores/otel/otel_log_store.py +++ b/src/zenml/log_stores/otel/otel_log_store.py @@ -88,7 +88,13 @@ def get_exporter(self) -> "LogExporter": def activate(self) -> None: """Activate log collection with OpenTelemetry.""" self._exporter = self.get_exporter() - self._processor = BatchLogRecordProcessor(self._exporter) + self._processor = BatchLogRecordProcessor( + self._exporter, + max_queue_size=self.config.max_queue_size, + schedule_delay_millis=self.config.schedule_delay_millis, + max_export_batch_size=self.config.max_export_batch_size, + export_timeout_millis=self.config.export_timeout_millis, + ) self._resource = Resource.create( { From 9ef088db42556c781553b57abc1d1038b3c2434e Mon Sep 17 00:00:00 2001 From: Baris Can Durak Date: Sat, 6 Dec 2025 02:05:50 +0100 Subject: [PATCH 81/81] proper limits --- .../log_stores/datadog/datadog_flavor.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/zenml/log_stores/datadog/datadog_flavor.py b/src/zenml/log_stores/datadog/datadog_flavor.py index 035bf0192d9..bc75130ce2f 100644 --- a/src/zenml/log_stores/datadog/datadog_flavor.py +++ 
b/src/zenml/log_stores/datadog/datadog_flavor.py @@ -15,7 +15,7 @@ from typing import Type -from pydantic import Field +from pydantic import Field, field_validator from zenml.enums import StackComponentType from zenml.log_stores import BaseLogStore, BaseLogStoreConfig @@ -23,6 +23,9 @@ from zenml.stack.flavor import Flavor from zenml.utils.secret_utils import PlainSerializedSecretStr +# Datadog API limits: https://docs.datadoghq.com/api/latest/logs/ +DATADOG_MAX_BATCH_SIZE = 1000 + class DatadogLogStoreConfig(OtelLogStoreConfig): """Configuration for Datadog log store. @@ -31,6 +34,7 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): api_key: Datadog API key for log ingestion. application_key: Datadog application key for log extraction. site: Datadog site (e.g., "datadoghq.com", "datadoghq.eu"). + max_export_batch_size: Maximum batch size for exports (Datadog limit: 1000). """ api_key: PlainSerializedSecretStr = Field( @@ -43,6 +47,31 @@ class DatadogLogStoreConfig(OtelLogStoreConfig): default="datadoghq.com", description="Datadog site (e.g., datadoghq.com, datadoghq.eu)", ) + max_export_batch_size: int = Field( + default=500, + description="Maximum batch size for exports (Datadog limit: 1000)", + ) + + @field_validator("max_export_batch_size") + @classmethod + def validate_max_export_batch_size(cls, v: int) -> int: + """Validate that max_export_batch_size doesn't exceed Datadog's limit. + + Args: + v: The value to validate. + + Returns: + The validated value. + + Raises: + ValueError: If the value exceeds Datadog's limit. + """ + if v > DATADOG_MAX_BATCH_SIZE: + raise ValueError( + f"max_export_batch_size cannot exceed {DATADOG_MAX_BATCH_SIZE} " + f"(Datadog API limit). Got: {v}" + ) + return v class DatadogLogStoreFlavor(Flavor):