Skip to content

feat: configure json logger for the whole gitingest module #451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ repos:
pytest-asyncio,
pytest-mock,
python-dotenv,
python-json-logger,
'sentry-sdk[fastapi]',
slowapi,
starlette>=0.40.0,
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies = [
"pathspec>=0.12.1",
"pydantic",
"python-dotenv",
"python-json-logger",
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
"strenum; python_version < '3.11'",
"tiktoken>=0.7.0", # Support for o200k_base encoding
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pathspec>=0.12.1
prometheus-client
pydantic
python-dotenv
python-json-logger
sentry-sdk[fastapi]
slowapi
starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
Expand Down
6 changes: 6 additions & 0 deletions src/gitingest/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
from __future__ import annotations

import asyncio
import logging
from typing import TypedDict

import click
from typing_extensions import Unpack

from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
from gitingest.entrypoint import ingest_async
from gitingest.logging_config import setup_json_logging

setup_json_logging()

logger = logging.getLogger(__name__)


class _CLIArgs(TypedDict):
Expand Down
10 changes: 6 additions & 4 deletions src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

import asyncio
import errno
import logging
import shutil
import stat
import sys
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING, AsyncGenerator, Callable
Expand All @@ -28,6 +28,8 @@

from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


async def ingest_async(
source: str,
Expand Down Expand Up @@ -209,19 +211,19 @@ def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str
"""
if tag and query.tag and tag != query.tag:
msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.tag = tag or query.tag

if branch and query.branch and branch != query.branch:
msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

query.branch = branch or query.branch

if tag and branch:
msg = "Warning: Both tag and branch are specified. The tag will be used."
warnings.warn(msg, RuntimeWarning, stacklevel=3)
logger.warning(msg)

# Tag wins over branch if both supplied
if query.tag:
Expand Down
17 changes: 10 additions & 7 deletions src/gitingest/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from pathlib import Path
from typing import TYPE_CHECKING

Expand All @@ -13,6 +14,8 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)


def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
"""Run the ingestion process for a parsed query.
Expand Down Expand Up @@ -111,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
_process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_file():
if sub_path.stat().st_size > query.max_file_size:
print(f"Skipping file {sub_path}: would exceed max file size limit")
logger.info("Skipping file %s: would exceed max file size limit", sub_path)
continue
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_dir():
Expand All @@ -133,7 +136,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
node.file_count += child_directory_node.file_count
node.dir_count += 1 + child_directory_node.dir_count
else:
print(f"Warning: {sub_path} is an unknown file type, skipping")
logger.warning("Warning: %s is an unknown file type, skipping", sub_path)

node.sort_children()

Expand Down Expand Up @@ -186,12 +189,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat

"""
if stats.total_files + 1 > MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return

file_size = path.stat().st_size
if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
print(f"Skipping file {path}: would exceed total size limit")
logger.info("Skipping file %s: would exceed total size limit", path)
return

stats.total_files += 1
Expand Down Expand Up @@ -232,15 +235,15 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:

"""
if depth > MAX_DIRECTORY_DEPTH:
print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
logger.warning("Maximum depth limit (%i) reached", MAX_DIRECTORY_DEPTH)
return True

if stats.total_files >= MAX_FILES:
print(f"Maximum file limit ({MAX_FILES}) reached")
logger.warning("Maximum file limit (%i) reached", MAX_FILES)
return True # TODO: end recursion

if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
logger.warning("Maxumum total size limit (%.1fMB) reached", MAX_TOTAL_SIZE_BYTES / 1024 / 1024)
return True # TODO: end recursion

return False
16 changes: 16 additions & 0 deletions src/gitingest/logging_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Global logger configuration."""

import logging
from typing import Literal

from pythonjsonlogger import jsonlogger


def setup_json_logging(level: int = logging.INFO) -> None:
    """Configure JSON logging for the whole ``gitingest`` package.

    Attach a single JSON-formatting stream handler to the ``gitingest``
    package logger so that records emitted by every child logger
    (``gitingest.ingestion``, ``gitingest.entrypoint``, ...) are rendered
    as structured JSON.

    Parameters
    ----------
    level : int
        Minimum logging level threshold (e.g. ``logging.INFO``).
        Defaults to ``logging.INFO``.

    """
    # Configure the package root logger, NOT ``__name__``: here ``__name__``
    # is ``gitingest.logging_config``, and sibling module loggers do not
    # propagate through it, so the handler would never see their records.
    logger = logging.getLogger("gitingest")
    logger.setLevel(level)
    log_handler = logging.StreamHandler()
    formatter = jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(name)s %(message)s")
    log_handler.setFormatter(formatter)
    # Assign (rather than append) so repeated calls stay idempotent and do
    # not stack duplicate handlers.
    logger.handlers = [log_handler]
7 changes: 5 additions & 2 deletions src/gitingest/output_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import tiktoken
Expand All @@ -12,6 +13,8 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
(1_000_000, "M"),
(1_000, "k"),
Expand Down Expand Up @@ -189,8 +192,8 @@ def _format_token_count(text: str) -> str | None:
try:
encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini
total_tokens = len(encoding.encode(text, disallowed_special=()))
except (ValueError, UnicodeEncodeError) as exc:
print(exc)
except (ValueError, UnicodeEncodeError):
logger.exception()
return None

for threshold, suffix in _TOKEN_THRESHOLDS:
Expand Down
9 changes: 7 additions & 2 deletions src/gitingest/query_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from __future__ import annotations

import logging
import uuid
import warnings
from pathlib import Path
from typing import Literal

Expand All @@ -18,6 +18,8 @@
_normalise_source,
)

logger = logging.getLogger(__name__)


async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
"""Parse a repository URL and return an ``IngestionQuery`` object.
Expand Down Expand Up @@ -71,16 +73,19 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
# TODO: Handle issues and pull requests
if query.type in {PathKind.ISSUES, PathKind.PULL}:
msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# If no extra path parts, just return
if not path_parts:
msg = f"Warning: No extra path parts: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

if query.type not in {PathKind.TREE, PathKind.BLOB}:
# TODO: Handle other types
msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root."
logger.warning(msg)
return await _fallback_to_root(query, token=token, warn_msg=msg)

# Commit, branch, or tag
Expand Down Expand Up @@ -169,7 +174,7 @@ async def _configure_branch_or_tag(
except RuntimeError as exc:
# If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
msg = f"Warning: Failed to fetch {_ref_type}: {exc}"
warnings.warn(msg, RuntimeWarning, stacklevel=2)
logger.warning(msg)
return path_parts.pop(0) if path_parts else None

# Iterate over the path components and try to find a matching branch/tag
Expand Down
15 changes: 7 additions & 8 deletions src/gitingest/utils/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import asyncio
import base64
import logging
import re
import sys
from pathlib import Path
Expand All @@ -15,11 +16,12 @@

from gitingest.utils.compat_func import removesuffix
from gitingest.utils.exceptions import InvalidGitHubTokenError
from server.server_utils import Colors

if TYPE_CHECKING:
from gitingest.schemas import CloneConfig

logger = logging.getLogger(__name__)

# GitHub Personal-Access tokens (classic + fine-grained).
# - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics
# - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics
Expand Down Expand Up @@ -97,13 +99,10 @@ async def ensure_git_installed() -> None:
try:
stdout, _ = await run_command("git", "config", "core.longpaths")
if stdout.decode().strip().lower() != "true":
print(
f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}Git clone may fail on Windows "
f"due to long file paths:{Colors.END}",
)
print(f"{Colors.RED}To avoid this issue, consider enabling long path support with:{Colors.END}")
print(f"{Colors.RED} git config --global core.longpaths true{Colors.END}")
print(f"{Colors.RED}Note: This command may require administrator privileges.{Colors.END}")
logger.warning("WARN: Git clone may fail on Windows due to long file paths:")
logger.warning("To avoid this issue, consider enabling long path support with:")
logger.warning(" git config --global core.longpaths true")
logger.warning("Note: This command may require administrator privileges.")
except RuntimeError:
# Ignore if checking 'core.longpaths' fails.
pass
Expand Down
12 changes: 5 additions & 7 deletions src/gitingest/utils/notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

import json
import warnings
import logging
from itertools import chain
from typing import TYPE_CHECKING, Any

Expand All @@ -12,6 +12,8 @@
if TYPE_CHECKING:
from pathlib import Path

logger = logging.getLogger(__name__)


def process_notebook(file: Path, *, include_output: bool = True) -> str:
"""Process a Jupyter notebook file and return an executable Python script as a string.
Expand Down Expand Up @@ -44,20 +46,16 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str:
# Check if the notebook contains worksheets
worksheets = notebook.get("worksheets")
if worksheets:
warnings.warn(
logger.warning(
"Worksheets are deprecated as of IPEP-17. Consider updating the notebook. "
"(See: https://github.com/jupyter/nbformat and "
"https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets "
"for more information.)",
DeprecationWarning,
stacklevel=2,
)

if len(worksheets) > 1:
warnings.warn(
logger.warning(
"Multiple worksheets detected. Combining all worksheets into a single script.",
UserWarning,
stacklevel=2,
)

cells = list(chain.from_iterable(ws["cells"] for ws in worksheets))
Expand Down
5 changes: 3 additions & 2 deletions src/gitingest/utils/query_parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from __future__ import annotations

import logging
import string
import warnings
from typing import TYPE_CHECKING, cast
from urllib.parse import ParseResult, unquote, urlparse

Expand All @@ -13,6 +13,7 @@
if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

logger = logging.getLogger(__name__)

HEX_DIGITS: set[str] = set(string.hexdigits)

Expand Down Expand Up @@ -56,7 +57,7 @@ async def _fallback_to_root(query: IngestionQuery, token: str | None, warn_msg:
url = cast("str", query.url)
query.commit = await _resolve_ref_to_sha(url, pattern="HEAD", token=token)
if warn_msg:
warnings.warn(warn_msg, RuntimeWarning, stacklevel=3)
logger.warning(warn_msg)
return query


Expand Down
Loading
Loading