feat: configure json logger for the whole gitingest module #451

Closed · wants to merge 5 commits
2 changes: 2 additions & 0 deletions .env.example
@@ -56,3 +56,5 @@ S3_REGION=us-east-1
S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket
# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
# S3_DIRECTORY_PREFIX=my-prefix

LOG_FORMAT=JSON
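
With `LOG_FORMAT=JSON` set, every record is emitted as one JSON object per line, which log shippers can ingest without extra parsing. A minimal sketch of consuming such a line (the field names mirror the format string in the new `logging_config.py` below; the sample values are illustrative, not captured output):

```python
import json

# Illustrative record; real values depend on the running process.
sample = (
    '{"asctime": "2024-01-01 12:00:00,000", "levelname": "WARNING",'
    ' "message": "Maximum file limit (10000) reached",'
    ' "name": "gitingest.ingestion", "module": "ingestion",'
    ' "funcName": "limit_exceeded", "lineno": 240}'
)

record = json.loads(sample)
print(record["levelname"], record["name"], record["message"], sep=" | ")
```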
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -124,6 +124,7 @@ repos:
pytest-asyncio,
pytest-mock,
python-dotenv,
python-json-logger,
'sentry-sdk[fastapi]',
slowapi,
starlette>=0.40.0,
1 change: 1 addition & 0 deletions pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
"pathspec>=0.12.1",
"pydantic",
"python-dotenv",
"python-json-logger",
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
"strenum; python_version < '3.11'",
"tiktoken>=0.7.0", # Support for o200k_base encoding
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,6 +6,7 @@ pathspec>=0.12.1
prometheus-client
pydantic
python-dotenv
python-json-logger
sentry-sdk[fastapi]
slowapi
starlette>=0.40.0 # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
23 changes: 13 additions & 10 deletions src/gitingest/__main__.py
@@ -4,13 +4,19 @@
from __future__ import annotations

import asyncio
import logging
from typing import TypedDict

import click
from typing_extensions import Unpack

from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
from gitingest.entrypoint import ingest_async
from gitingest.logging_config import setup_logging

setup_logging()

logger = logging.getLogger(__name__)


class _CLIArgs(TypedDict):
@@ -163,9 +169,9 @@ async def _async_main(
output_target = output if output is not None else OUTPUT_FILE_NAME

if output_target == "-":
click.echo("Analyzing source, preparing output for stdout...", err=True)
logger.debug("Analyzing source, preparing output for stdout...")
else:
click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True)
logger.debug("Analyzing source, output will be written to '%s'...", output_target)

summary, _, _ = await ingest_async(
source,
@@ -180,18 +186,15 @@
)
except Exception as exc:
# Convert any exception into Click.Abort so that exit status is non-zero
click.echo(f"Error: {exc}", err=True)
logger.exception("Ingest failed.", exc_info=exc)
raise click.Abort from exc

if output_target == "-": # stdout
click.echo("\n--- Summary ---", err=True)
click.echo(summary, err=True)
click.echo("--- End Summary ---", err=True)
if output_target == "-":
click.echo(f"--- Summary ---\n{summary}\n--- End Summary ---", err=True)
click.echo("Analysis complete! Output sent to stdout.", err=True)
else: # file
else:
click.echo(f"Analysis complete! Output written to: {output_target}")
click.echo("\nSummary:")
click.echo(summary)
click.echo(f"Summary:\n{summary}")


if __name__ == "__main__":
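
The CLI now follows the standard library's recommended wiring: configure handlers once at process start (`setup_logging()` at import time in `__main__`), then let every module fetch its own child logger via `logging.getLogger(__name__)`. A minimal sketch of that pattern, with `basicConfig` standing in for `setup_logging()` and an illustrative logger name:

```python
import logging

logging.basicConfig(level=logging.INFO)  # stand-in for setup_logging()

logger = logging.getLogger("gitingest.__main__")
logger.debug("Analyzing source, preparing output for stdout...")  # suppressed at INFO
logger.warning("Warnings and above still reach the console")      # emitted
```

Note that the progress messages move from `click.echo(..., err=True)` (always visible) to `logger.debug(...)` (hidden at the default INFO level), so they become opt-in diagnostics rather than user-facing output.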
39 changes: 18 additions & 21 deletions src/gitingest/entrypoint.py
@@ -4,10 +4,10 @@

import asyncio
import errno
import logging
import shutil
import stat
import sys
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING, AsyncGenerator, Callable
@@ -28,6 +28,8 @@

from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


async def ingest_async(
source: str,
@@ -209,19 +211,19 @@ def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str
"""
if tag and query.tag and tag != query.tag:
msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.tag = tag or query.tag

if branch and query.branch and branch != query.branch:
msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.branch = branch or query.branch

if tag and branch:
msg = "Warning: Both tag and branch are specified. The tag will be used."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

# Tag wins over branch if both supplied
if query.tag:
@@ -300,22 +302,17 @@ def _handle_remove_readonly(


async def _write_output(tree: str, content: str, target: str | None) -> None:
"""Write combined output to ``target`` (``"-"`` ⇒ stdout).

Parameters
----------
tree : str
The tree-like string representation of the file structure.
content : str
The content of the files in the repository or directory.
target : str | None
The path to the output file. If ``None``, the results are not written to a file.

"""
"""Write combined output to ``target`` (``"-"`` ⇒ stdout)."""
data = f"{tree}\n{content}"
loop = asyncio.get_running_loop()
if target == "-":
await loop.run_in_executor(None, sys.stdout.write, data)
await loop.run_in_executor(None, sys.stdout.flush)
elif target is not None:
await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")
try:
if target == "-":
logger.debug("Writing output to stdout.")
await loop.run_in_executor(None, sys.stdout.write, data)
await loop.run_in_executor(None, sys.stdout.flush)
elif target is not None:
logger.debug("Writing output to file: %s", target)
await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")
except Exception as exc:
logger.exception("Failed to write output to %s.", target, exc_info=exc)
raise
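
`_write_output` keeps blocking I/O off the event loop by delegating writes to the default thread-pool executor. A self-contained sketch of the same pattern, simplified from the function above (the demo payload is made up):

```python
from __future__ import annotations

import asyncio
import sys
from pathlib import Path


async def write_output(data: str, target: str | None) -> None:
    loop = asyncio.get_running_loop()
    if target == "-":
        # stdout writes can block on slow pipes, so run them in a worker thread
        await loop.run_in_executor(None, sys.stdout.write, data)
        await loop.run_in_executor(None, sys.stdout.flush)
    elif target is not None:
        await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")


asyncio.run(write_output("tree\ncontent\n", "-"))
```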
17 changes: 10 additions & 7 deletions src/gitingest/ingestion.py
@@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING

@@ -13,6 +14,8 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
"""Run the ingestion process for a parsed query.
@@ -111,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
_process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_file():
if sub_path.stat().st_size > query.max_file_size:
print(f"Skipping file {sub_path}: would exceed max file size limit")
logger.debug("Skipping file %s: would exceed max file size limit", sub_path)
continue
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_dir():
@@ -133,7 +136,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
node.file_count += child_directory_node.file_count
node.dir_count += 1 + child_directory_node.dir_count
else:
print(f"Warning: {sub_path} is an unknown file type, skipping")
logger.warning("Warning: %s is an unknown file type, skipping", sub_path)

node.sort_children()

@@ -186,12 +189,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat

"""
if stats.total_files + 1 > MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return

file_size = path.stat().st_size
if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
print(f"Skipping file {path}: would exceed total size limit")
logger.debug("Skipping file %s: would exceed total size limit", path)
return

stats.total_files += 1
@@ -232,15 +235,15 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:

"""
if depth > MAX_DIRECTORY_DEPTH:
print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
logger.warning("Maximum depth limit (%i) reached", MAX_DIRECTORY_DEPTH)
return True

if stats.total_files >= MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return True # TODO: end recursion

if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
logger.warning("Maxumum total size limit (%.1fMB) reached", MAX_TOTAL_SIZE_BYTES / 1024 / 1024)
return True # TODO: end recursion

return False
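
One reason the new calls use %-style placeholders instead of f-strings: the logging module interpolates arguments only when a record is actually emitted, so suppressed DEBUG messages cost almost nothing. A quick illustration (the limit value is made up, not gitingest's actual constant):

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("demo")

MAX_FILES = 10_000  # illustrative value

logger.debug("Skipping file %s: would exceed total size limit", "big.bin")  # never formatted
logger.warning("Maximum file limit (%i) reached", MAX_FILES)  # formatted only on emit
```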
28 changes: 28 additions & 0 deletions src/gitingest/logging_config.py
@@ -0,0 +1,28 @@
"""Global logger configuration."""

import logging
import os

from pythonjsonlogger import jsonlogger


def setup_logging(level: int = logging.INFO) -> None:
"""Configure the root logger for the whole gitingest module.

Select the formatter based on the LOG_FORMAT environment variable:
- 'json': JSON formatter (time/level/msg, then extras)
- any other value or unset: default formatter
"""
logger = logging.getLogger()
logger.setLevel(level)
log_handler = logging.StreamHandler()

log_format = os.getenv("LOG_FORMAT", "default").lower()
if log_format == "json":
formatter = jsonlogger.JsonFormatter(
"%(asctime)s %(levelname)s %(message)s %(name)s %(module)s %(funcName)s %(lineno)d",
)
else:
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
log_handler.setFormatter(formatter)
logger.handlers = [log_handler]
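
A usage sketch for the new module, assuming it is importable as `gitingest.logging_config` (the commented output shape is indicative, not captured):

```python
import logging
import os

from gitingest.logging_config import setup_logging

os.environ["LOG_FORMAT"] = "json"  # any other value, or unset, selects the plain formatter
setup_logging(level=logging.DEBUG)

logging.getLogger("gitingest.demo").warning("hello")
# -> {"asctime": "...", "levelname": "WARNING", "message": "hello",
#     "name": "gitingest.demo", "module": "...", "funcName": "...", "lineno": ...}
```

Assigning `logger.handlers = [log_handler]` rather than calling `addHandler` means repeated `setup_logging()` calls replace the handler instead of stacking duplicates that would double every line.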
9 changes: 6 additions & 3 deletions src/gitingest/output_formatter.py
@@ -2,19 +2,22 @@

from __future__ import annotations

import ssl
import logging
import warnings
from ssl import SSLError
from typing import TYPE_CHECKING

import requests.exceptions
import tiktoken
from requests.exceptions import RequestException

from gitingest.schemas import FileSystemNode, FileSystemNodeType
from gitingest.utils.compat_func import readlink

if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
(1_000_000, "M"),
(1_000, "k"),
@@ -195,7 +198,7 @@ def _format_token_count(text: str) -> str | None:
except (ValueError, UnicodeEncodeError) as exc:
warnings.warn(f"Failed to estimate token size: {exc}", RuntimeWarning, stacklevel=3)
return None
except (requests.exceptions.RequestException, ssl.SSLError) as exc:
except (RequestException, SSLError) as exc:
# If network errors, skip token count estimation instead of erroring out
warnings.warn(f"Failed to download tiktoken model: {exc}", RuntimeWarning, stacklevel=3)
return None
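
The import changes above keep the same warn-and-degrade behavior: network or TLS failures while fetching the tiktoken encoding skip token counting instead of failing the whole ingest. A standalone sketch of that pattern (the function name is an assumption; `o200k_base` matches the encoding named in `pyproject.toml`):

```python
from __future__ import annotations

import warnings
from ssl import SSLError

from requests.exceptions import RequestException


def count_tokens(text: str) -> int | None:
    """Return a token count, or None when the encoding cannot be fetched."""
    try:
        import tiktoken  # may download encoding data on first use

        return len(tiktoken.get_encoding("o200k_base").encode(text))
    except (RequestException, SSLError) as exc:
        warnings.warn(f"Failed to download tiktoken model: {exc}", RuntimeWarning, stacklevel=2)
        return None
```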
9 changes: 7 additions & 2 deletions src/gitingest/query_parser.py
@@ -2,8 +2,8 @@

from __future__ import annotations

import logging
import uuid
import warnings
from pathlib import Path
from typing import Literal

@@ -18,6 +18,8 @@
_normalise_source,
)

logger = logging.getLogger(__name__)


async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
"""Parse a repository URL and return an ``IngestionQuery`` object.
@@ -71,16 +73,19 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
# TODO: Handle issues and pull requests
if query.type in {PathKind.ISSUES, PathKind.PULL}:
msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# If no extra path parts, just return
if not path_parts:
msg = f"Warning: No extra path parts: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

if query.type not in {PathKind.TREE, PathKind.BLOB}:
# TODO: Handle other types
msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# Commit, branch, or tag
@@ -169,7 +174,7 @@ async def _configure_branch_or_tag(
except RuntimeError as exc:
# If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
msg = f"Warning: Failed to fetch {_ref_type}: {exc}"
warnings.warn(msg, RuntimeWarning, stacklevel=2)
logger.warning(msg)
return path_parts.pop(0) if path_parts else None

# Iterate over the path components and try to find a matching branch/tag
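
All three unsupported-URL branches follow the same shape: log the warning, then pass the identical message to `_fallback_to_root` as `warn_msg`. A condensed sketch (the kind strings and helper are stand-ins for the real `PathKind` enum and `_fallback_to_root` call):

```python
from __future__ import annotations

import logging

logger = logging.getLogger("gitingest.query_parser")

UNSUPPORTED_KINDS = {"issues", "pull"}  # stand-ins for PathKind.ISSUES / PathKind.PULL


def warn_if_unsupported(kind: str, url: str) -> str | None:
    """Return the warning message when the URL kind forces a fallback to the repo root."""
    if kind in UNSUPPORTED_KINDS:
        msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
        logger.warning(msg)
        return msg  # forwarded as warn_msg in the real code
    return None
```

If `_fallback_to_root` also surfaces `warn_msg`, each message may appear twice; whether that duplication is intended is worth confirming.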
26 changes: 13 additions & 13 deletions src/gitingest/schemas/ingestion.py
@@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)

@@ -10,6 +11,8 @@
from gitingest.config import MAX_FILE_SIZE
from gitingest.schemas.cloning import CloneConfig

logger = logging.getLogger(__name__)


class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
"""Pydantic model to store the parsed details of the repository or file path.
@@ -72,21 +75,18 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
s3_url: str | None = None

def extract_clone_config(self) -> CloneConfig:
"""Extract the relevant fields for the CloneConfig object.

Returns
-------
CloneConfig
A CloneConfig object containing the relevant fields.

Raises
------
ValueError
If the ``url`` parameter is not provided.

"""
"""Extract the relevant fields for the CloneConfig object."""
logger.debug(
"Extracting CloneConfig for url=%s, local_path=%s, branch=%s, tag=%s, commit=%s",
self.url,
self.local_path,
self.branch,
self.tag,
self.commit,
)
if not self.url:
msg = "The 'url' parameter is required."
logger.error(msg)
raise ValueError(msg)

return CloneConfig(
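
`extract_clone_config` now logs its inputs at DEBUG on entry and logs the error before raising, so the failure appears in the JSON stream even when a caller swallows the `ValueError`. The log-then-raise pattern in isolation (names here are illustrative):

```python
from __future__ import annotations

import logging

logger = logging.getLogger("gitingest.schemas")


def require_url(url: str | None) -> str:
    """Validate that a URL was supplied, logging before raising."""
    if not url:
        msg = "The 'url' parameter is required."
        logger.error(msg)  # record the failure before surfacing it
        raise ValueError(msg)
    return url
```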