From d963fb7a0a6c76631394670f3a6d8972c7ee60eb Mon Sep 17 00:00:00 2001 From: ix-56h Date: Fri, 25 Jul 2025 17:27:02 +0200 Subject: [PATCH 1/3] Configure json logger for the whole gitingest module --- .pre-commit-config.yaml | 1 + pyproject.toml | 1 + requirements.txt | 1 + src/gitingest/__main__.py | 6 +++++ src/gitingest/entrypoint.py | 10 +++++--- src/gitingest/ingestion.py | 17 ++++++++----- src/gitingest/logging_config.py | 16 ++++++++++++ src/gitingest/output_formatter.py | 7 +++-- src/gitingest/query_parser.py | 9 +++++-- src/gitingest/utils/git_utils.py | 15 +++++------ src/gitingest/utils/notebook.py | 12 ++++----- src/gitingest/utils/query_parser_utils.py | 5 ++-- src/server/query_processor.py | 31 ++++++++++------------- src/server/server_utils.py | 19 ++++++++------ 14 files changed, 92 insertions(+), 58 deletions(-) create mode 100644 src/gitingest/logging_config.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4aa5f0e1..d28b174c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -122,6 +122,7 @@ repos: pytest-asyncio, pytest-mock, python-dotenv, + python-json-logger, 'sentry-sdk[fastapi]', slowapi, starlette>=0.40.0, diff --git a/pyproject.toml b/pyproject.toml index 334140dc..2a6f1c81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "pathspec>=0.12.1", "pydantic", "python-dotenv", + "python-json-logger", "starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw) "strenum; python_version < '3.11'", "tiktoken>=0.7.0", # Support for o200k_base encoding diff --git a/requirements.txt b/requirements.txt index 712360e9..c12fd5bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ pathspec>=0.12.1 prometheus-client pydantic python-dotenv +python-json-logger sentry-sdk[fastapi] slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py index e14ed681..b4cec415 100644 --- a/src/gitingest/__main__.py +++ b/src/gitingest/__main__.py @@ -4,6 +4,7 @@ from __future__ import annotations import asyncio +import logging from typing import TypedDict import click @@ -11,6 +12,11 @@ from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async +from gitingest.logging_config import setup_json_logging + +setup_json_logging() + +logger = logging.getLogger(__name__) class _CLIArgs(TypedDict): diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 321e1b3e..bf3710f7 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -4,10 +4,10 @@ import asyncio import errno +import logging import shutil import stat import sys -import warnings from contextlib import asynccontextmanager from pathlib import Path from typing import TYPE_CHECKING, AsyncGenerator, Callable @@ -28,6 +28,8 @@ from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) + async def ingest_async( source: str, @@ -209,19 +211,19 @@ def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str """ if tag and query.tag and tag != query.tag: msg = f"Warning: The specified tag '{tag}' overrides the tag found in the URL '{query.tag}'." 
-        warnings.warn(msg, RuntimeWarning, stacklevel=3)
+        logger.warning(msg)
 
     query.tag = tag or query.tag
 
     if branch and query.branch and branch != query.branch:
         msg = f"Warning: The specified branch '{branch}' overrides the branch found in the URL '{query.branch}'."
-        warnings.warn(msg, RuntimeWarning, stacklevel=3)
+        logger.warning(msg)
 
         query.branch = branch or query.branch
 
     if tag and branch:
         msg = "Warning: Both tag and branch are specified. The tag will be used."
-        warnings.warn(msg, RuntimeWarning, stacklevel=3)
+        logger.warning(msg)
 
     # Tag wins over branch if both supplied
     if query.tag:
diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py
index 489a41a4..6d460960 100644
--- a/src/gitingest/ingestion.py
+++ b/src/gitingest/ingestion.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -13,6 +14,8 @@
 if TYPE_CHECKING:
     from gitingest.schemas import IngestionQuery
 
+logger = logging.getLogger(__name__)
+
 
 def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
     """Run the ingestion process for a parsed query.
@@ -111,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
             _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_file():
             if sub_path.stat().st_size > query.max_file_size:
-                print(f"Skipping file {sub_path}: would exceed max file size limit")
+                logger.info("Skipping file %s: would exceed max file size limit", sub_path)
                 continue
             _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_dir():
@@ -133,7 +136,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
                 node.file_count += child_directory_node.file_count
                 node.dir_count += 1 + child_directory_node.dir_count
         else:
-            print(f"Warning: {sub_path} is an unknown file type, skipping")
+            logger.warning("%s is an unknown file type, skipping", sub_path)
 
     node.sort_children()
 
@@ -186,12 +189,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
 
     """
    if stats.total_files + 1 > MAX_FILES:
-        print(f"Maximum file limit ({MAX_FILES}) reached")
+        logger.warning("Maximum file limit (%i) reached", MAX_FILES)
        return
 
    file_size = path.stat().st_size
    if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
-        print(f"Skipping file {path}: would exceed total size limit")
+        logger.info("Skipping file %s: would exceed total size limit", path)
        return
 
    stats.total_files += 1
@@ -232,15 +235,15 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
 
     """
    if depth > MAX_DIRECTORY_DEPTH:
-        print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
+        logger.warning("Maximum depth limit (%i) reached", MAX_DIRECTORY_DEPTH)
        return True
 
    if stats.total_files >= MAX_FILES:
-        print(f"Maximum file limit ({MAX_FILES}) reached")
+        logger.warning("Maximum file limit (%i) reached", MAX_FILES)
        return True  # TODO: end recursion
 
    if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
-        print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
+        logger.warning("Maximum total size limit (%.1fMB) reached", MAX_TOTAL_SIZE_BYTES / 1024 / 1024)
        return True  # TODO: end recursion
 
    return False
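
# The print() calls above become %-style logger calls on purpose: logging defers
# string interpolation until a handler actually emits the record, so messages
# filtered out by level cost almost nothing. A minimal, self-contained sketch of
# that behavior (hypothetical logger name and path, not code from this patch):
import logging

demo_logger = logging.getLogger("gitingest.ingestion.demo")
demo_logger.setLevel(logging.WARNING)

# Filtered out at INFO: the "%s" argument is never interpolated.
demo_logger.info("Skipping file %s: would exceed max file size limit", "/tmp/huge.bin")
# Emitted at WARNING: arguments are interpolated only now.
demo_logger.warning("Maximum file limit (%i) reached", 10_000)
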
diff --git a/src/gitingest/logging_config.py b/src/gitingest/logging_config.py
new file mode 100644
index 00000000..9e6a2eae
--- /dev/null
+++ b/src/gitingest/logging_config.py
@@ -0,0 +1,15 @@
+"""Global logger configuration."""
+
+import logging
+
+from pythonjsonlogger import jsonlogger
+
+
+def setup_json_logging(level: int = logging.INFO) -> None:
+    """Configure json logger for the whole gitingest module."""
+    logger = logging.getLogger(__name__)
+    logger.setLevel(level)
+    log_handler = logging.StreamHandler()
+    formatter = jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(name)s %(message)s")
+    log_handler.setFormatter(formatter)
+    logger.handlers = [log_handler]
diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py
index 8a5b4135..364aaec9 100644
--- a/src/gitingest/output_formatter.py
+++ b/src/gitingest/output_formatter.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 from typing import TYPE_CHECKING
 
 import tiktoken
@@ -12,6 +13,8 @@
 if TYPE_CHECKING:
     from gitingest.schemas import IngestionQuery
 
+logger = logging.getLogger(__name__)
+
 _TOKEN_THRESHOLDS: list[tuple[int, str]] = [
     (1_000_000, "M"),
     (1_000, "k"),
@@ -189,8 +192,8 @@ def _format_token_count(text: str) -> str | None:
     try:
         encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
         total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError) as exc:
-        print(exc)
+    except (ValueError, UnicodeEncodeError):
+        logger.exception()
         return None
 
     for threshold, suffix in _TOKEN_THRESHOLDS:
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 65b3f065..0ca2bdb7 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -2,8 +2,8 @@
 
 from __future__ import annotations
 
+import logging
 import uuid
-import warnings
 from pathlib import Path
 from typing import Literal
 
@@ -18,6 +18,8 @@
     _normalise_source,
 )
 
+logger = logging.getLogger(__name__)
+
 
 async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
     """Parse a repository URL and return an ``IngestionQuery`` object.
@@ -71,16 +73,19 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
     # TODO: Handle issues and pull requests
     if query.type in {PathKind.ISSUES, PathKind.PULL}:
         msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
+        logger.warning(msg)
         return await _fallback_to_root(query, token=token, warn_msg=msg)
 
     # If no extra path parts, just return
     if not path_parts:
         msg = f"Warning: No extra path parts: {url}. Returning repository root."
+        logger.warning(msg)
         return await _fallback_to_root(query, token=token, warn_msg=msg)
 
     if query.type not in {PathKind.TREE, PathKind.BLOB}:
         # TODO: Handle other types
         msg = f"Warning: Type '{query.type}' is not yet supported: {url}. Returning repository root."
+        logger.warning(msg)
         return await _fallback_to_root(query, token=token, warn_msg=msg)
 
     # Commit, branch, or tag
@@ -169,7 +174,7 @@ async def _configure_branch_or_tag(
     except RuntimeError as exc:
         # If remote discovery fails, we optimistically treat the first path segment as the branch/tag.
msg = f"Warning: Failed to fetch {_ref_type}: {exc}" - warnings.warn(msg, RuntimeWarning, stacklevel=2) + logger.warning(msg) return path_parts.pop(0) if path_parts else None # Iterate over the path components and try to find a matching branch/tag diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index a094e944..ba70e78e 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -4,6 +4,7 @@ import asyncio import base64 +import logging import re import sys from pathlib import Path @@ -15,11 +16,12 @@ from gitingest.utils.compat_func import removesuffix from gitingest.utils.exceptions import InvalidGitHubTokenError -from server.server_utils import Colors if TYPE_CHECKING: from gitingest.schemas import CloneConfig +logger = logging.getLogger(__name__) + # GitHub Personal-Access tokens (classic + fine-grained). # - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics # - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics @@ -97,13 +99,10 @@ async def ensure_git_installed() -> None: try: stdout, _ = await run_command("git", "config", "core.longpaths") if stdout.decode().strip().lower() != "true": - print( - f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}Git clone may fail on Windows " - f"due to long file paths:{Colors.END}", - ) - print(f"{Colors.RED}To avoid this issue, consider enabling long path support with:{Colors.END}") - print(f"{Colors.RED} git config --global core.longpaths true{Colors.END}") - print(f"{Colors.RED}Note: This command may require administrator privileges.{Colors.END}") + logger.warning("WARN: Git clone may fail on Windows due to long file paths:") + logger.warning("To avoid this issue, consider enabling long path support with:") + logger.warning(" git config --global core.longpaths true") + logger.warning("Note: This command may require administrator privileges.") except RuntimeError: # Ignore if checking 'core.longpaths' fails. pass diff --git a/src/gitingest/utils/notebook.py b/src/gitingest/utils/notebook.py index cfa09238..667e0812 100644 --- a/src/gitingest/utils/notebook.py +++ b/src/gitingest/utils/notebook.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -import warnings +import logging from itertools import chain from typing import TYPE_CHECKING, Any @@ -12,6 +12,8 @@ if TYPE_CHECKING: from pathlib import Path +logger = logging.getLogger(__name__) + def process_notebook(file: Path, *, include_output: bool = True) -> str: """Process a Jupyter notebook file and return an executable Python script as a string. @@ -44,20 +46,16 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str: # Check if the notebook contains worksheets worksheets = notebook.get("worksheets") if worksheets: - warnings.warn( + logger.warning( "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " "(See: https://github.com/jupyter/nbformat and " "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets " "for more information.)", - DeprecationWarning, - stacklevel=2, ) if len(worksheets) > 1: - warnings.warn( + logger.warning( "Multiple worksheets detected. 
Combining all worksheets into a single script.", - UserWarning, - stacklevel=2, ) cells = list(chain.from_iterable(ws["cells"] for ws in worksheets)) diff --git a/src/gitingest/utils/query_parser_utils.py b/src/gitingest/utils/query_parser_utils.py index 41dc7ada..80234bf2 100644 --- a/src/gitingest/utils/query_parser_utils.py +++ b/src/gitingest/utils/query_parser_utils.py @@ -2,8 +2,8 @@ from __future__ import annotations +import logging import string -import warnings from typing import TYPE_CHECKING, cast from urllib.parse import ParseResult, unquote, urlparse @@ -13,6 +13,7 @@ if TYPE_CHECKING: from gitingest.schemas import IngestionQuery +logger = logging.getLogger(__name__) HEX_DIGITS: set[str] = set(string.hexdigits) @@ -56,7 +57,7 @@ async def _fallback_to_root(query: IngestionQuery, token: str | None, warn_msg: url = cast("str", query.url) query.commit = await _resolve_ref_to_sha(url, pattern="HEAD", token=token) if warn_msg: - warnings.warn(warn_msg, RuntimeWarning, stacklevel=3) + logger.warning(warn_msg) return query diff --git a/src/server/query_processor.py b/src/server/query_processor.py index a7b60f61..28554f7f 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging from pathlib import Path from typing import cast @@ -12,7 +13,9 @@ from gitingest.utils.pattern_utils import process_patterns from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType from server.server_config import MAX_DISPLAY_SIZE -from server.server_utils import Colors, log_slider_to_size +from server.server_utils import log_slider_to_size + +logger = logging.getLogger(__name__) async def process_query( @@ -54,8 +57,7 @@ async def process_query( try: query = await parse_remote_repo(input_text, token=token) except Exception as exc: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{exc}{Colors.END}") + logger.exception() return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) @@ -80,7 +82,8 @@ async def process_query( f.write(tree + "\n" + content) except Exception as exc: - _print_error(query.url, exc, max_file_size, pattern_type, pattern) + logger.exception() + _print_error(query.url, max_file_size, pattern_type, pattern) return IngestErrorResponse(error=str(exc)) if len(content) > MAX_DISPLAY_SIZE: @@ -126,27 +129,22 @@ def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) """ default_max_file_kb = 50 - print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") + logger.info("%s", url) if int(max_file_size / 1024) != default_max_file_kb: - print( - f" | {Colors.YELLOW}Size: {int(max_file_size / 1024)}kB{Colors.END}", - end="", - ) + logger.info("Size: %ikB", int(max_file_size / 1024)) if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + logger.info("Include %s", pattern) elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + logger.info("Exclude %s", pattern) -def _print_error(url: str, exc: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: +def _print_error(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """Print a formatted error message for debugging. Parameters ---------- url : str The URL associated with the query that caused the error. - exc : Exception - The exception raised during the query or process. 
max_file_size : int The maximum file size allowed for the query, in bytes. pattern_type : str @@ -155,9 +153,7 @@ def _print_error(url: str, exc: Exception, max_file_size: int, pattern_type: str The actual pattern string to include or exclude in the query. """ - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{exc}{Colors.END}") def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: @@ -178,6 +174,5 @@ def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str """ estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") + logger.info("%s", estimated_tokens) diff --git a/src/server/server_utils.py b/src/server/server_utils.py index b0371661..7f200d96 100644 --- a/src/server/server_utils.py +++ b/src/server/server_utils.py @@ -1,6 +1,7 @@ """Utility functions for the server.""" import asyncio +import logging import math import shutil import time @@ -20,6 +21,8 @@ # Initialize a rate limiter limiter = Limiter(key_func=get_remote_address) +logger = logging.getLogger(__name__) + async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: """Handle rate-limiting errors with a custom exception handler. @@ -104,8 +107,8 @@ async def _remove_old_repositories( await _process_folder(folder) - except (OSError, PermissionError) as exc: - print(f"Error in _remove_old_repositories: {exc}") + except (OSError, PermissionError): + logger.exception("Exception in _remove_old_repositories") await asyncio.sleep(scan_interval) @@ -134,16 +137,16 @@ async def _process_folder(folder: Path) -> None: owner, repo = filename.split("-", 1) repo_url = f"{owner}/{repo}" await loop.run_in_executor(None, _append_line, history_file, repo_url) - except (OSError, PermissionError) as exc: - print(f"Error logging repository URL for {folder}: {exc}") + except (OSError, PermissionError): + logger.exception("Exception raised while processing folder %s", folder) # Delete the cloned repo try: await loop.run_in_executor(None, shutil.rmtree, folder) - except PermissionError as exc: - print(f"No permission to delete {folder}: {exc}") - except OSError as exc: - print(f"Could not delete {folder}: {exc}") + except PermissionError: + logger.exception("No permission to delete %s", folder) + except OSError: + logger.exception("Could not delete %s", folder) def _append_line(path: Path, line: str) -> None: From b95fc8a45e94880cf73a5ad49d3d8422d015ecd9 Mon Sep 17 00:00:00 2001 From: ix-56h Date: Mon, 28 Jul 2025 15:58:16 +0200 Subject: [PATCH 2/3] Add LOG_FORMAT env, add logging for the backend --- .env.example | 2 + src/gitingest/__main__.py | 24 +++--- src/gitingest/entrypoint.py | 29 +++----- src/gitingest/ingestion.py | 4 +- src/gitingest/logging_config.py | 21 +++++- src/gitingest/output_formatter.py | 2 +- src/gitingest/schemas/ingestion.py | 26 +++---- src/gitingest/utils/git_utils.py | 10 ++- src/gitingest/utils/notebook.py | 21 +----- src/server/main.py | 4 + src/server/query_processor.py | 116 ++++++++++------------------- src/server/routers/ingest.py | 43 ++++++++++- 12 files changed, 152 insertions(+), 150 deletions(-) diff --git a/.env.example b/.env.example index 8d98ebba..84e0a5a9 100644 --- 
a/.env.example
+++ b/.env.example
@@ -33,3 +33,6 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
 GITINGEST_SENTRY_SEND_DEFAULT_PII=true
 # Environment name for Sentry (default: "")
 GITINGEST_SENTRY_ENVIRONMENT=development
+
+# Log output format: "json" for structured JSON logs, anything else (or unset) for plain text
+LOG_FORMAT=JSON
diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py
index b4cec415..bbb1db19 100644
--- a/src/gitingest/__main__.py
+++ b/src/gitingest/__main__.py
@@ -12,9 +12,9 @@
 
 from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
 from gitingest.entrypoint import ingest_async
-from gitingest.logging_config import setup_json_logging
+from gitingest.logging_config import setup_logging
 
-setup_json_logging()
+setup_logging()
 
 logger = logging.getLogger(__name__)
 
@@ -169,9 +169,9 @@ async def _async_main(
         output_target = output if output is not None else OUTPUT_FILE_NAME
 
         if output_target == "-":
-            click.echo("Analyzing source, preparing output for stdout...", err=True)
+            logger.debug("Analyzing source, preparing output for stdout...")
         else:
-            click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True)
+            logger.debug("Analyzing source, output will be written to '%s'...", output_target)
 
         summary, _, _ = await ingest_async(
             source,
@@ -186,18 +186,18 @@ async def _async_main(
         )
     except Exception as exc:
         # Convert any exception into Click.Abort so that exit status is non-zero
-        click.echo(f"Error: {exc}", err=True)
+        logger.exception("Ingest failed.", exc_info=exc)
         raise click.Abort from exc
 
     if output_target == "-":  # stdout
-        click.echo("\n--- Summary ---", err=True)
-        click.echo(summary, err=True)
-        click.echo("--- End Summary ---", err=True)
-        click.echo("Analysis complete! Output sent to stdout.", err=True)
+        logger.info("--- Summary ---")
+        logger.info(summary)
+        logger.info("--- End Summary ---")
+        logger.info("Analysis complete! Output sent to stdout.")
     else:  # file
-        click.echo(f"Analysis complete! Output written to: {output_target}")
-        click.echo("\nSummary:")
-        click.echo(summary)
+        logger.info("Analysis complete! Output written to: %s", output_target)
+        logger.info("Summary:")
+        logger.info(summary)
 
 
 if __name__ == "__main__":
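
# LOG_FORMAT (added to .env.example above) drives the formatter selection that
# this patch introduces in logging_config.py further down; the value is
# lowercased, so "JSON" and "json" behave the same. A runnable sketch of that
# selection logic, assuming python-json-logger's jsonlogger.JsonFormatter:
import logging
import os

from pythonjsonlogger import jsonlogger

os.environ.setdefault("LOG_FORMAT", "JSON")  # normally loaded from .env by python-dotenv

handler = logging.StreamHandler()
if os.getenv("LOG_FORMAT", "default").lower() == "json":
    handler.setFormatter(jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(message)s %(name)s"))
else:
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logging.getLogger().handlers = [handler]

logging.getLogger("gitingest.demo").warning("structured hello")
# JSON mode emits one object per record, roughly:
# {"asctime": "2025-07-28 15:58:16,123", "levelname": "WARNING",
#  "message": "structured hello", "name": "gitingest.demo"}
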
diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py
index bf3710f7..444c6969 100644
--- a/src/gitingest/entrypoint.py
+++ b/src/gitingest/entrypoint.py
@@ -302,22 +302,17 @@ def _handle_remove_readonly(
 
 
 async def _write_output(tree: str, content: str, target: str | None) -> None:
-    """Write combined output to ``target`` (``"-"`` ⇒ stdout).
-
-    Parameters
-    ----------
-    tree : str
-        The tree-like string representation of the file structure.
-    content : str
-        The content of the files in the repository or directory.
-    target : str | None
-        The path to the output file. If ``None``, the results are not written to a file.
-
-    """
+    """Write combined output to ``target`` (``"-"`` ⇒ stdout)."""
     data = f"{tree}\n{content}"
     loop = asyncio.get_running_loop()
-    if target == "-":
-        await loop.run_in_executor(None, sys.stdout.write, data)
-        await loop.run_in_executor(None, sys.stdout.flush)
-    elif target is not None:
-        await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")
+    try:
+        if target == "-":
+            logger.debug("Writing output to stdout.")
+            await loop.run_in_executor(None, sys.stdout.write, data)
+            await loop.run_in_executor(None, sys.stdout.flush)
+        elif target is not None:
+            logger.debug("Writing output to file: %s", target)
+            await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")
+    except Exception as exc:
+        logger.exception("Failed to write output to %s.", target, exc_info=exc)
+        raise
diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py
index 6d460960..a2fb19b6 100644
--- a/src/gitingest/ingestion.py
+++ b/src/gitingest/ingestion.py
@@ -114,7 +114,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
             _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_file():
             if sub_path.stat().st_size > query.max_file_size:
-                logger.info("Skipping file %s: would exceed max file size limit", sub_path)
+                logger.debug("Skipping file %s: would exceed max file size limit", sub_path)
                 continue
             _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
         elif sub_path.is_dir():
@@ -194,7 +194,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
 
     file_size = path.stat().st_size
     if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
-        logger.info("Skipping file %s: would exceed total size limit", path)
+        logger.debug("Skipping file %s: would exceed total size limit", path)
         return
 
     stats.total_files += 1
diff --git a/src/gitingest/logging_config.py b/src/gitingest/logging_config.py
index 9e6a2eae..e1cde977 100644
--- a/src/gitingest/logging_config.py
+++ b/src/gitingest/logging_config.py
@@ -1,15 +1,28 @@
 """Global logger configuration."""
 
 import logging
+import os
 
 from pythonjsonlogger import jsonlogger
 
 
-def setup_json_logging(level: int = logging.INFO) -> None:
-    """Configure json logger for the whole gitingest module."""
-    logger = logging.getLogger(__name__)
+def setup_logging(level: int = logging.INFO) -> None:
+    """Configure logger for the whole gitingest module.
+
+    Selects formatter based on LOG_FORMAT env variable:
+    - 'json': JSON formatter (time/level/msg, then extras)
+    - any other value or unset: default formatter
+    """
+    logger = logging.getLogger()
     logger.setLevel(level)
     log_handler = logging.StreamHandler()
-    formatter = jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(name)s %(message)s")
+
+    log_format = os.getenv("LOG_FORMAT", "default").lower()
+    if log_format == "json":
+        formatter = jsonlogger.JsonFormatter(
+            "%(asctime)s %(levelname)s %(message)s %(name)s %(module)s %(funcName)s %(lineno)d",
+        )
+    else:
+        formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
     log_handler.setFormatter(formatter)
     logger.handlers = [log_handler]
diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py
index 364aaec9..b494ab22 100644
--- a/src/gitingest/output_formatter.py
+++ b/src/gitingest/output_formatter.py
@@ -193,7 +193,7 @@ def _format_token_count(text: str) -> str | None:
         encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
         total_tokens = len(encoding.encode(text, disallowed_special=()))
     except (ValueError, UnicodeEncodeError):
-        logger.exception()
+        logger.exception("Failed to estimate token size.")
         return None
 
     for threshold, suffix in _TOKEN_THRESHOLDS:
diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py
index 97e98804..dbbb68e0 100644
--- a/src/gitingest/schemas/ingestion.py
+++ b/src/gitingest/schemas/ingestion.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from pydantic import BaseModel, Field
@@ -9,6 +10,8 @@
 from gitingest.config import MAX_FILE_SIZE
 from gitingest.schemas.cloning import CloneConfig
 
+logger = logging.getLogger(__name__)
+
 
 class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     """Pydantic model to store the parsed details of the repository or file path.
@@ -68,21 +71,18 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     include_submodules: bool = Field(default=False)
 
     def extract_clone_config(self) -> CloneConfig:
-        """Extract the relevant fields for the CloneConfig object.
-
-        Returns
-        -------
-        CloneConfig
-            A CloneConfig object containing the relevant fields.
-
-        Raises
-        ------
-        ValueError
-            If the ``url`` parameter is not provided.
-
-        """
+        """Extract the relevant fields for the CloneConfig object."""
+        logger.debug(
+            "Extracting CloneConfig for url=%s, local_path=%s, branch=%s, tag=%s, commit=%s",
+            self.url,
+            self.local_path,
+            self.branch,
+            self.tag,
+            self.commit,
+        )
         if not self.url:
             msg = "The 'url' parameter is required."
+ logger.error(msg) raise ValueError(msg) return CloneConfig( diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index ba70e78e..c3d82173 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -99,10 +99,12 @@ async def ensure_git_installed() -> None: try: stdout, _ = await run_command("git", "config", "core.longpaths") if stdout.decode().strip().lower() != "true": - logger.warning("WARN: Git clone may fail on Windows due to long file paths:") - logger.warning("To avoid this issue, consider enabling long path support with:") - logger.warning(" git config --global core.longpaths true") - logger.warning("Note: This command may require administrator privileges.") + logger.warning( + """Git clone may fail on Windows due to long file paths: +To avoid this issue, consider enabling long path support with: + git config --global core.longpaths true +Note: This command may require administrator privileges.""", + ) except RuntimeError: # Ignore if checking 'core.longpaths' fails. pass diff --git a/src/gitingest/utils/notebook.py b/src/gitingest/utils/notebook.py index 667e0812..024b0480 100644 --- a/src/gitingest/utils/notebook.py +++ b/src/gitingest/utils/notebook.py @@ -41,6 +41,7 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str: notebook: dict[str, Any] = json.load(f) except json.JSONDecodeError as exc: msg = f"Invalid JSON in notebook: {file}" + logger.exception(msg) raise InvalidNotebookError(msg) from exc # Check if the notebook contains worksheets @@ -125,24 +126,7 @@ def _process_cell(cell: dict[str, Any], *, include_output: bool) -> str | None: def _extract_output(output: dict[str, Any]) -> list[str]: - """Extract the output from a Jupyter notebook cell. - - Parameters - ---------- - output : dict[str, Any] - The output dictionary from a Jupyter notebook cell. - - Returns - ------- - list[str] - The output as a list of strings. - - Raises - ------ - ValueError - If an unknown output type is encountered. 
- - """ + """Extract the output from a Jupyter notebook cell.""" output_type = output["output_type"] if output_type == "stream": @@ -155,4 +139,5 @@ def _extract_output(output: dict[str, Any]) -> list[str]: return [f"Error: {output['ename']}: {output['evalue']}"] msg = f"Unknown output type: {output_type}" + logger.error(msg) raise ValueError(msg) diff --git a/src/server/main.py b/src/server/main.py index 2a07773a..30003c75 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -14,6 +14,7 @@ from slowapi.errors import RateLimitExceeded from starlette.middleware.trustedhost import TrustedHostMiddleware +from gitingest.logging_config import setup_logging from server.metrics_server import start_metrics_server from server.routers import dynamic, index, ingest from server.server_config import templates @@ -22,6 +23,9 @@ # Load environment variables from .env file load_dotenv() +# Setup logging based on LOG_FORMAT env variable +setup_logging() + # Initialize Sentry SDK if enabled if os.getenv("GITINGEST_SENTRY_ENABLED") is not None: sentry_dsn = os.getenv("GITINGEST_SENTRY_DSN") diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 28554f7f..d4f9019c 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -49,56 +49,85 @@ async def process_query( A union type, corresponding to IngestErrorResponse or IngestSuccessResponse """ + logger.debug( + "Processing query: input_text=%s, slider_position=%s, pattern_type=%s, pattern=%s", + input_text, + slider_position, + pattern_type, + pattern, + ) if token: + logger.debug("Validating GitHub token.") validate_github_token(token) max_file_size = log_slider_to_size(slider_position) + logger.debug("Calculated max_file_size: %s", max_file_size) try: + logger.debug("Parsing remote repo.") query = await parse_remote_repo(input_text, token=token) + logger.debug("Parsed query: url=%s, user=%s, repo=%s", query.url, query.user_name, query.repo_name) except Exception as exc: - logger.exception() + logger.exception("Failed to parse remote repo.") return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) query.host = cast("str", query.host) query.max_file_size = max_file_size + logger.debug("Processing patterns: pattern_type=%s, pattern=%s", pattern_type, pattern) query.ignore_patterns, query.include_patterns = process_patterns( exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None, include_patterns=pattern if pattern_type == PatternType.INCLUDE else None, ) clone_config = query.extract_clone_config() + logger.debug("Cloning repo with config: %r", clone_config) await clone_repo(clone_config, token=token) short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "/" for the page title try: + logger.debug("Running ingest_query.") summary, tree, content = ingest_query(query) - + logger.debug("Ingest query complete. Writing tree and content to file.") # TODO: why are we writing the tree and content to a file here? 
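
# The .txt written here is not dead weight: GET /api/download/{ingest_id}
# (routers/ingest.py below) globs the ingest directory for the first *.txt and
# serves it via FileResponse, so this file is the artifact behind the
# "download full ingest" link mentioned in the truncation notice.
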
local_txt_file = Path(clone_config.local_path).with_suffix(".txt") with local_txt_file.open("w", encoding="utf-8") as f: f.write(tree + "\n" + content) + logger.debug("Wrote output to %s", local_txt_file) except Exception as exc: - logger.exception() - _print_error(query.url, max_file_size, pattern_type, pattern) + logger.exception( + "Error processing query for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s).", + query.url, + max_file_size, + pattern_type, + pattern, + exc_info=exc, + ) return IngestErrorResponse(error=str(exc)) if len(content) > MAX_DISPLAY_SIZE: + logger.info( + "Content cropped to %sk characters for display.", + int(MAX_DISPLAY_SIZE / 1_000), + ) # Important: user-facing truncation content = ( f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] ) - _print_success( - url=query.url, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=summary, - ) + logger.info( + "Query processed successfully for URL %s (max_file_size=%s, pattern_type=%s, pattern=%s)", + query.url, + max_file_size, + pattern_type, + pattern, + ) # Important: successful query + estimated_tokens = None + if "Estimated tokens:" in summary: + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + logger.info("Estimated tokens: %s", estimated_tokens) # Important: token estimation return IngestSuccessResponse( repo_url=input_text, @@ -111,68 +140,3 @@ async def process_query( pattern_type=pattern_type, pattern=pattern, ) - - -def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """Print a formatted summary of the query details for debugging. - - Parameters - ---------- - url : str - The URL associated with the query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - - """ - default_max_file_kb = 50 - logger.info("%s", url) - if int(max_file_size / 1024) != default_max_file_kb: - logger.info("Size: %ikB", int(max_file_size / 1024)) - if pattern_type == "include" and pattern != "": - logger.info("Include %s", pattern) - elif pattern_type == "exclude" and pattern != "": - logger.info("Exclude %s", pattern) - - -def _print_error(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """Print a formatted error message for debugging. - - Parameters - ---------- - url : str - The URL associated with the query that caused the error. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - - """ - _print_query(url, max_file_size, pattern_type, pattern) - - -def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: - """Print a formatted success message for debugging. - - Parameters - ---------- - url : str - The URL associated with the successful query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. 
- summary : str - A summary of the query result, including details like estimated tokens. - - """ - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - _print_query(url, max_file_size, pattern_type, pattern) - logger.info("%s", estimated_tokens) diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 521b7de0..9b0d0001 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -1,5 +1,7 @@ """Ingest endpoint for the API.""" +import logging + from fastapi import APIRouter, HTTPException, Request, status from fastapi.responses import FileResponse, JSONResponse from prometheus_client import Counter @@ -11,6 +13,7 @@ from server.server_utils import limiter ingest_counter = Counter("gitingest_ingest_total", "Number of ingests", ["status", "url"]) +logger = logging.getLogger(__name__) router = APIRouter() @@ -36,6 +39,13 @@ async def api_ingest( - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ + logger.debug( + "POST /api/ingest called with input_text=%s, max_file_size=%s, pattern_type=%s, pattern=%s", + ingest_request.input_text, + ingest_request.max_file_size, + ingest_request.pattern_type, + ingest_request.pattern, + ) response = await _perform_ingestion( input_text=ingest_request.input_text, max_file_size=ingest_request.max_file_size, @@ -43,7 +53,12 @@ async def api_ingest( pattern=ingest_request.pattern, token=ingest_request.token, ) - # limit URL to 255 characters + logger.info( + "Ingest POST result: status_code=%s, url=%s", + response.status_code, + ingest_request.input_text[:255], + ) # Important event: ingestion result + ingest_counter.labels(status=response.status_code, url=ingest_request.input_text[:255]).inc() return response @@ -78,6 +93,16 @@ async def api_ingest_get( **Returns** - **JSONResponse**: Success response with ingestion results or error response with appropriate HTTP status code """ + logger.debug( + "GET /api/%s/%s called with user=%s, repository=%s, max_file_size=%s, pattern_type=%s, pattern=%s", + user, + repository, + user, + repository, + max_file_size, + pattern_type, + pattern, + ) response = await _perform_ingestion( input_text=f"{user}/{repository}", max_file_size=max_file_size, @@ -85,7 +110,13 @@ async def api_ingest_get( pattern=pattern, token=token or None, ) - # limit URL to 255 characters + logger.info( + "Ingest GET result: status_code=%s, url=%s/%s", + response.status_code, + user, + repository, + ) # Important event: ingestion result + ingest_counter.labels(status=response.status_code, url=f"{user}/{repository}"[:255]).inc() return response @@ -115,22 +146,28 @@ async def download_ingest(ingest_id: str) -> FileResponse: # Normalize and validate the directory path directory = (TMP_BASE_PATH / ingest_id).resolve() if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): + logger.error("Invalid ingest ID: %s (directory traversal attempt)", ingest_id) raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") if not directory.is_dir(): + logger.error("Digest %s not found (directory does not exist)", ingest_id) raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found") try: first_txt_file = next(directory.glob("*.txt")) + logger.debug("Found .txt file for download: %s", first_txt_file) except StopIteration as exc: + logger.exception("No .txt file found for digest %s", ingest_id, exc_info=exc) raise HTTPException( 
status_code=status.HTTP_404_NOT_FOUND, - detail=f"No .txt file found for digest {ingest_id!r}", + detail=f"No .txt file found for digest {ingest_id}", ) from exc try: + logger.info("Returning FileResponse for %s", first_txt_file) # Important event: file download return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name) except PermissionError as exc: + logger.exception("Permission denied for %s", first_txt_file, exc_info=exc) raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail=f"Permission denied for {first_txt_file}", From ea991d9e3d194ddf8bd4388289dd85fca0c1f2da Mon Sep 17 00:00:00 2001 From: ix-56h Date: Mon, 28 Jul 2025 17:55:53 +0200 Subject: [PATCH 3/3] fix tests --- src/gitingest/__main__.py | 15 ++++++--------- src/gitingest/utils/notebook.py | 9 +++++++-- tests/test_summary.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py index bbb1db19..8c417bcc 100644 --- a/src/gitingest/__main__.py +++ b/src/gitingest/__main__.py @@ -189,15 +189,12 @@ async def _async_main( logger.exception("Ingest failed.", exc_info=exc) raise click.Abort from exc - if output_target == "-": # stdout - logger.info("--- Summary ---") - logger.info(summary) - logger.info("--- End Summary ---") - logger.info("Analysis complete! Output sent to stdout.") - else: # file - logger.info("Analysis complete! Output written to: %s", output_target) - logger.info("Summary:") - logger.info(summary) + if output_target == "-": + click.echo(f"--- Summary ---\n{summary}\n--- End Summary ---", err=True) + click.echo("Analysis complete! Output sent to stdout.", err=True) + else: + click.echo(f"Analysis complete! Output written to: {output_target}") + click.echo(f"Summary:\n{summary}") if __name__ == "__main__": diff --git a/src/gitingest/utils/notebook.py b/src/gitingest/utils/notebook.py index 024b0480..96776966 100644 --- a/src/gitingest/utils/notebook.py +++ b/src/gitingest/utils/notebook.py @@ -4,6 +4,7 @@ import json import logging +import warnings from itertools import chain from typing import TYPE_CHECKING, Any @@ -47,16 +48,20 @@ def process_notebook(file: Path, *, include_output: bool = True) -> str: # Check if the notebook contains worksheets worksheets = notebook.get("worksheets") if worksheets: - logger.warning( + warnings.warn( "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " "(See: https://github.com/jupyter/nbformat and " "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets " "for more information.)", + DeprecationWarning, + stacklevel=2, ) if len(worksheets) > 1: - logger.warning( + warnings.warn( "Multiple worksheets detected. Combining all worksheets into a single script.", + UserWarning, + stacklevel=2, ) cells = list(chain.from_iterable(ws["cells"] for ws in worksheets)) diff --git a/tests/test_summary.py b/tests/test_summary.py index ac32394a..9d9d25ed 100644 --- a/tests/test_summary.py +++ b/tests/test_summary.py @@ -55,7 +55,7 @@ def test_ingest_summary(path_type: str, path: str, ref_type: str, ref: str) -> N is_blob = path_type == "blob" expected_lines = _calculate_expected_lines(ref_type, is_main_branch=is_main_branch) expected_non_empty_lines = expected_lines - 1 - + print(f"https://github.com/{REPO}/{path_type}/{ref}{path}") summary, _, _ = ingest(f"https://github.com/{REPO}/{path_type}/{ref}{path}") lines = summary.splitlines() parsed_lines = dict(line.split(": ", 1) for line in lines if ": " in line)
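
# Patch 3 reverts notebook.py to warnings.warn() and the CLI summary back to
# click.echo() because both are observable contracts: pytest.warns() only sees
# the warnings machinery, and CLI tests assert on captured stdout/stderr, which
# logger calls stop feeding once the JSON handler is installed. A sketch of the
# kind of assertion this preserves (hypothetical test, not from the repository's
# suite):
import warnings

import pytest


def _deprecated_worksheets() -> None:
    # Stand-in for process_notebook() hitting the legacy "worksheets" branch.
    warnings.warn("Worksheets are deprecated as of IPEP-17.", DeprecationWarning, stacklevel=2)


def test_worksheet_warning_is_catchable() -> None:
    # logger.warning() would bypass pytest.warns(); warnings.warn() does not.
    with pytest.warns(DeprecationWarning, match="IPEP-17"):
        _deprecated_worksheets()
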