diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4e3b4d86..da934083 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,12 +58,6 @@ repos: - id: python-use-type-annotations description: 'Enforce that python3.6+ type annotations are used instead of type comments.' - - repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort - description: 'Sort imports alphabetically, and automatically separated into sections and by type.' - - repo: https://github.com/pre-commit/mirrors-eslint rev: v9.30.1 hooks: diff --git a/README.md b/README.md index a31c780a..596098c9 100644 --- a/README.md +++ b/README.md @@ -144,12 +144,60 @@ By default, the digest is written to a text file (`digest.txt`) in your current - Use `--output/-o ` to write to a specific file. - Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools). +### 🔧 Configure processing limits + +```bash +# Set higher limits for large repositories +gitingest https://github.com/torvalds/linux \ + --max-files 100000 \ + --max-total-size 2147483648 \ + --max-directory-depth 25 + +# Process only Python files up to 1MB each +gitingest /path/to/project \ + --include-pattern "*.py" \ + --max-size 1048576 \ + --max-files 1000 +``` + See more options and usage details with: ```bash gitingest --help ``` +### Configuration via Environment Variables + +You can configure various limits and settings using environment variables. 
All configuration environment variables start with the `GITINGEST_` prefix: + +#### File Processing Configuration + +- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process *(default: 10485760 bytes, 10 MB)* +- `GITINGEST_MAX_FILES` - Maximum number of files to process *(default: 10000)* +- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum size of output file *(default: 524288000 bytes, 500 MB)* +- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal *(default: 20)* +- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds *(default: 60)* +- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename *(default: "digest.txt")* +- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files *(default: system temp directory)* + +#### Server Configuration (for self-hosting) + +- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI *(default: 300000 bytes)* +- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds *(default: 3600, 1 hour)* +- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in kB *(default: 102400, 100 MB)* +- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI *(default: 500)* + +#### Example usage + +```bash +# Configure for large scientific repositories +export GITINGEST_MAX_FILES=50000 +export GITINGEST_MAX_FILE_SIZE=20971520 # 20 MB +export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1 GB + +gitingest https://github.com/some/large-repo +``` + ## 🐍 Python package usage ```python @@ -178,6 +226,15 @@ summary, tree, content = ingest("https://github.com/username/private-repo") # Include repository submodules summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) + +# Configure limits programmatically +summary, tree, content = ingest( + "https://github.com/username/large-repo", + max_file_size=20 * 1024 * 1024, # 20 MB per file + max_files=50000, # 50k files max + max_total_size_bytes=1024**3, # 1 GB total + 
max_directory_depth=30 # 30 levels deep +) ``` By default, this won't write a file but can be enabled with the `output` argument. diff --git a/pyproject.toml b/pyproject.toml index ffbf6504..8a5870e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,14 +113,6 @@ case-sensitive = true [tool.pycln] all = true -# TODO: Remove this once we figure out how to use ruff-isort -[tool.isort] -profile = "black" -line_length = 119 -remove_redundant_aliases = true -float_to_top = true # https://github.com/astral-sh/ruff/issues/6514 -order_by_type = true -filter_files = true # Test configuration [tool.pytest.ini_options] diff --git a/src/gitingest/__main__.py b/src/gitingest/__main__.py index e14ed681..dd01a400 100644 --- a/src/gitingest/__main__.py +++ b/src/gitingest/__main__.py @@ -9,16 +9,20 @@ import click from typing_extensions import Unpack -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_FILE_SIZE, MAX_TOTAL_SIZE_BYTES, OUTPUT_FILE_NAME from gitingest.entrypoint import ingest_async class _CLIArgs(TypedDict): source: str max_size: int + max_files: int + max_total_size: int + max_directory_depth: int exclude_pattern: tuple[str, ...] include_pattern: tuple[str, ...] 
branch: str | None + tag: str | None include_gitignored: bool include_submodules: bool token: str | None @@ -34,6 +38,24 @@ class _CLIArgs(TypedDict): show_default=True, help="Maximum file size to process in bytes", ) +@click.option( + "--max-files", + default=MAX_FILES, + show_default=True, + help="Maximum number of files to process", +) +@click.option( + "--max-total-size", + default=MAX_TOTAL_SIZE_BYTES, + show_default=True, + help="Maximum total size of all files in bytes", +) +@click.option( + "--max-directory-depth", + default=MAX_DIRECTORY_DEPTH, + show_default=True, + help="Maximum depth of directory traversal", +) @click.option("--exclude-pattern", "-e", multiple=True, help="Shell-style patterns to exclude.") @click.option( "--include-pattern", @@ -42,6 +64,7 @@ class _CLIArgs(TypedDict): help="Shell-style patterns to include.", ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option("--tag", default=None, help="Tag to clone and ingest") @click.option( "--include-gitignored", is_flag=True, @@ -98,7 +121,7 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: $ gitingest --include-pattern "*.js" --exclude-pattern "node_modules/*" Private repositories: - $ gitingest https://github.com/user/private-repo -t ghp_token + $ gitingest https://github.com/user/private-repo --token ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo Include submodules: @@ -112,9 +135,13 @@ async def _async_main( source: str, *, max_size: int = MAX_FILE_SIZE, + max_files: int = MAX_FILES, + max_total_size: int = MAX_TOTAL_SIZE_BYTES, + max_directory_depth: int = MAX_DIRECTORY_DEPTH, exclude_pattern: tuple[str, ...] | None = None, include_pattern: tuple[str, ...] | None = None, branch: str | None = None, + tag: str | None = None, include_gitignored: bool = False, include_submodules: bool = False, token: str | None = None, @@ -132,21 +159,29 @@ async def _async_main( A directory path or a Git repository URL. 
max_size : int Maximum file size in bytes to ingest (default: 10 MB). + max_files : int + Maximum number of files to ingest (default: 10,000). + max_total_size : int + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int + Maximum depth of directory traversal (default: 20). exclude_pattern : tuple[str, ...] | None Glob patterns for pruning the file set. include_pattern : tuple[str, ...] | None Glob patterns for including files in the output. branch : str | None - Git branch to ingest. If ``None``, the repository's default branch is used. + Git branch to clone and ingest (default: the default branch). + tag : str | None + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool - If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). + If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - The path where the output file will be written (default: ``digest.txt`` in current directory). + The path where the output file is written (default: ``digest.txt`` in current directory). Use ``"-"`` to write to ``stdout``. 
Raises @@ -170,9 +205,13 @@ async def _async_main( summary, _, _ = await ingest_async( source, max_file_size=max_size, - include_patterns=include_patterns, + max_files=max_files, + max_total_size_bytes=max_total_size, + max_directory_depth=max_directory_depth, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, + tag=tag, include_gitignored=include_gitignored, include_submodules=include_submodules, token=token, diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 3d154684..3d7ff400 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -3,12 +3,14 @@ import tempfile from pathlib import Path -MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum size of a single file to process (10 MB) -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10_000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB) -DEFAULT_TIMEOUT = 60 # seconds +from gitingest.utils.config_utils import _get_int_env_var, _get_str_env_var -OUTPUT_FILE_NAME = "digest.txt" +MAX_FILE_SIZE = _get_int_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024) # Max file size to process in bytes (10 MB) +MAX_FILES = _get_int_env_var("MAX_FILES", 10_000) # Max number of files to process +MAX_TOTAL_SIZE_BYTES = _get_int_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024) # Max output file size (500 MB) +MAX_DIRECTORY_DEPTH = _get_int_env_var("MAX_DIRECTORY_DEPTH", 20) # Max depth of directory traversal -TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" +DEFAULT_TIMEOUT = _get_int_env_var("DEFAULT_TIMEOUT", 60) # Default timeout for git operations in seconds + +OUTPUT_FILE_NAME = _get_str_env_var("OUTPUT_FILE_NAME", "digest.txt") +TMP_BASE_PATH = Path(_get_str_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest" diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 321e1b3e..d57a3552 100644 --- a/src/gitingest/entrypoint.py +++ 
b/src/gitingest/entrypoint.py @@ -33,8 +33,11 @@ async def ingest_async( source: str, *, max_file_size: int = MAX_FILE_SIZE, - include_patterns: str | set[str] | None = None, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -51,17 +54,23 @@ async def ingest_async( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int | None + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. + include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). + Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). 
include_submodules : bool @@ -70,7 +79,7 @@ async def ingest_async( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. @@ -107,6 +116,13 @@ async def ingest_async( if query.url: _override_branch_and_tag(query, branch=branch, tag=tag) + if max_files is not None: + query.max_files = max_files + if max_total_size_bytes is not None: + query.max_total_size_bytes = max_total_size_bytes + if max_directory_depth is not None: + query.max_directory_depth = max_directory_depth + query.include_submodules = include_submodules async with _clone_repo_if_remote(query, token=token): @@ -121,8 +137,11 @@ def ingest( source: str, *, max_file_size: int = MAX_FILE_SIZE, - include_patterns: str | set[str] | None = None, + max_files: int | None = None, + max_total_size_bytes: int | None = None, + max_directory_depth: int | None = None, exclude_patterns: str | set[str] | None = None, + include_patterns: str | set[str] | None = None, branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, @@ -139,17 +158,23 @@ def ingest( Parameters ---------- source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. + A directory path or a Git repository URL. max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB). - include_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to include. If ``None``, all files are included. + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int | None + Maximum number of files to ingest (default: 10,000). 
+ max_total_size_bytes : int | None + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int | None + Maximum depth of directory traversal (default: 20). exclude_patterns : str | set[str] | None - Pattern or set of patterns specifying which files to exclude. If ``None``, no files are excluded. + Glob patterns for pruning the file set. + include_patterns : str | set[str] | None + Glob patterns for including files in the output. branch : str | None - The branch to clone and ingest (default: the default branch). + Git branch to clone and ingest (default: the default branch). tag : str | None - The tag to clone and ingest. If ``None``, no tag is used. + Git tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). include_submodules : bool @@ -158,7 +183,7 @@ def ingest( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - File path where the summary and content should be written. + File path where the summary and content are written. If ``"-"`` (dash), the results are written to ``stdout``. If ``None``, the results are not written to a file. 
@@ -179,8 +204,11 @@ def ingest( ingest_async( source=source, max_file_size=max_file_size, - include_patterns=include_patterns, + max_files=max_files, + max_total_size_bytes=max_total_size_bytes, + max_directory_depth=max_directory_depth, exclude_patterns=exclude_patterns, + include_patterns=include_patterns, branch=branch, tag=tag, include_gitignored=include_gitignored, diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 489a41a4..86eed9b6 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import TYPE_CHECKING -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include @@ -97,7 +96,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem Statistics tracking object for the total file count and size. 
""" - if limit_exceeded(stats, depth=node.depth): + if limit_exceeded(stats, depth=node.depth, query=query): return for sub_path in node.path.iterdir(): @@ -113,7 +112,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem if sub_path.stat().st_size > query.max_file_size: print(f"Skipping file {sub_path}: would exceed max file size limit") continue - _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) + _process_file(path=sub_path, parent_node=node, stats=stats, query=query) elif sub_path.is_dir(): child_directory_node = FileSystemNode( name=sub_path.name, @@ -167,7 +166,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS parent_node.file_count += 1 -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: +def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, query: IngestionQuery) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. @@ -181,17 +180,19 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat The dictionary to accumulate the results. stats : FileSystemStats Statistics tracking object for the total file count and size. - local_path : Path - The base path of the repository or directory being processed. + query : IngestionQuery + The query object containing the limit configurations. 
""" - if stats.total_files + 1 > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files + 1 > query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return file_size = path.stat().st_size - if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {path}: would exceed total size limit") + if stats.total_size + file_size > query.max_total_size_bytes: + print( + f"Skipping file {path}: would exceed total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB)", + ) return stats.total_files += 1 @@ -202,7 +203,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat type=FileSystemNodeType.FILE, size=file_size, file_count=1, - path_str=str(path.relative_to(local_path)), + path_str=str(path.relative_to(query.local_path)), path=path, depth=parent_node.depth + 1, ) @@ -212,7 +213,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat parent_node.file_count += 1 -def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: +def limit_exceeded(stats: FileSystemStats, depth: int, query: IngestionQuery) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: @@ -224,6 +225,8 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: Statistics tracking object for the total file count and size. depth : int The current depth of directory traversal. + query : IngestionQuery + The query object containing the limit configurations. Returns ------- @@ -231,16 +234,16 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: ``True`` if any limit has been exceeded, ``False`` otherwise. 
""" - if depth > MAX_DIRECTORY_DEPTH: - print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached") + if depth > query.max_directory_depth: + print(f"Maximum depth limit ({query.max_directory_depth}) reached") return True - if stats.total_files >= MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") + if stats.total_files >= query.max_files: + print(f"Maximum file limit ({query.max_files}) reached") return True # TODO: end recursion - if stats.total_size >= MAX_TOTAL_SIZE_BYTES: - print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached") + if stats.total_size >= query.max_total_size_bytes: + print(f"Maxumum total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB) reached") return True # TODO: end recursion return False diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 21369075..9f5c48bf 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, Field -from gitingest.config import MAX_FILE_SIZE +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_FILE_SIZE, MAX_TOTAL_SIZE_BYTES from gitingest.schemas.cloning import CloneConfig @@ -19,33 +19,39 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes host : str | None The host of the repository. user_name : str | None - The username or owner of the repository. + Username or owner of the repository. repo_name : str | None - The name of the repository. + Name of the repository. local_path : Path - The local path to the repository or file. + Local path to the repository or file. url : str | None - The URL of the repository. + URL of the repository. slug : str The slug of the repository. id : UUID The ID of the repository. subpath : str - The subpath to the repository or file (default: ``"/"``). + Subpath to the repository or file (default: ``"/"``). type : str | None - The type of the repository or file. 
+ Type of the repository or file. branch : str | None - The branch of the repository. + Branch of the repository. commit : str | None - The commit of the repository. - tag : str | None - The tag of the repository. + Commit of the repository. + tag : str | None + Tag of the repository. max_file_size : int - The maximum file size to ingest in bytes (default: 10 MB). + Maximum file size in bytes to ingest (default: 10 MB). + max_files : int + Maximum number of files to ingest (default: 10,000). + max_total_size_bytes : int + Maximum total size of output file in bytes (default: 500 MB). + max_directory_depth : int + Maximum depth of directory traversal (default: 20). ignore_patterns : set[str] - The patterns to ignore (default: ``set()``). + Patterns to ignore. include_patterns : set[str] | None - The patterns to include. + Patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. (default: ``False``) s3_url : str | None @@ -66,6 +72,9 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes commit: str | None = None tag: str | None = None max_file_size: int = Field(default=MAX_FILE_SIZE) + max_files: int = Field(default=MAX_FILES) + max_total_size_bytes: int = Field(default=MAX_TOTAL_SIZE_BYTES) + max_directory_depth: int = Field(default=MAX_DIRECTORY_DEPTH) ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns include_patterns: set[str] | None = None include_submodules: bool = Field(default=False) diff --git a/src/gitingest/utils/colors.py b/src/gitingest/utils/colors.py new file mode 100644 index 00000000..c080c77b --- /dev/null +++ b/src/gitingest/utils/colors.py @@ -0,0 +1,30 @@ +"""Color printing utility.""" + + +class Colors: + """ANSI color codes.""" + + BLACK = "\033[0;30m" + RED = "\033[0;31m" + GREEN = "\033[0;32m" + BROWN = "\033[0;33m" + BLUE = "\033[0;34m" + PURPLE = "\033[0;35m" + CYAN = "\033[0;36m" + LIGHT_GRAY = 
"\033[0;37m" + DARK_GRAY = "\033[1;30m" + LIGHT_RED = "\033[1;31m" + LIGHT_GREEN = "\033[1;32m" + YELLOW = "\033[1;33m" + LIGHT_BLUE = "\033[1;34m" + LIGHT_PURPLE = "\033[1;35m" + LIGHT_CYAN = "\033[1;36m" + WHITE = "\033[1;37m" + BOLD = "\033[1m" + FAINT = "\033[2m" + ITALIC = "\033[3m" + UNDERLINE = "\033[4m" + BLINK = "\033[5m" + NEGATIVE = "\033[7m" + CROSSED = "\033[9m" + END = "\033[0m" diff --git a/src/gitingest/utils/config_utils.py b/src/gitingest/utils/config_utils.py new file mode 100644 index 00000000..626bda5a --- /dev/null +++ b/src/gitingest/utils/config_utils.py @@ -0,0 +1,53 @@ +"""Configuration utilities.""" + +from __future__ import annotations + +import os +import warnings + + +def _get_str_env_var(key: str, default: str) -> str: + """Get string environment variable with ``GITINGEST_`` prefix and fallback to default. + + Parameters + ---------- + key : str + The name of the environment variable. + default : str + The default value to return if the environment variable is not set. + + Returns + ------- + str + The value of the environment variable. + + """ + value = os.environ.get(f"GITINGEST_{key}") + + if value is None: + return default + + return value + + +def _get_int_env_var(key: str, default: int) -> int: + """Get integer environment variable with ``GITINGEST_`` prefix and fallback to default. + + Parameters + ---------- + key : str + The name of the environment variable. + default : int + The default value to return if the environment variable is not set. + + Returns + ------- + int + The value of the environment variable as an integer. + + """ + try: + return int(_get_str_env_var(key, default=str(default))) + except ValueError: + warnings.warn(f"Invalid value for GITINGEST_{key}. 
Using default: {default}", stacklevel=2) + return default diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index a094e944..05233d76 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -13,9 +13,9 @@ import httpx from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND +from gitingest.utils.colors import Colors from gitingest.utils.compat_func import removesuffix from gitingest.utils.exceptions import InvalidGitHubTokenError -from server.server_utils import Colors if TYPE_CHECKING: from gitingest.schemas import CloneConfig diff --git a/src/server/main.py b/src/server/main.py index 2a07773a..da2c760b 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -16,7 +16,7 @@ from server.metrics_server import start_metrics_server from server.routers import dynamic, index, ingest -from server.server_config import templates +from server.server_config import JINJA_TEMPLATES from server.server_utils import lifespan, limiter, rate_limit_exception_handler # Load environment variables from .env file @@ -164,7 +164,7 @@ async def custom_swagger_ui(request: Request) -> HTMLResponse: - **HTMLResponse**: Custom Swagger UI documentation page """ - return templates.TemplateResponse("swagger_ui.jinja", {"request": request}) + return JINJA_TEMPLATES.TemplateResponse("swagger_ui.jinja", {"request": request}) @app.get("/api", include_in_schema=True) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 172330ac..7df74585 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,11 +8,13 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query from gitingest.query_parser import parse_remote_repo +from gitingest.utils.colors import Colors from gitingest.utils.git_utils import validate_github_token from gitingest.utils.pattern_utils import process_patterns from server.models import 
IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 from server.server_config import MAX_DISPLAY_SIZE +from server.server_utils import log_slider_to_size from server.server_utils import Colors diff --git a/src/server/routers/dynamic.py b/src/server/routers/dynamic.py index 93b9d68b..62a26e7e 100644 --- a/src/server/routers/dynamic.py +++ b/src/server/routers/dynamic.py @@ -3,7 +3,7 @@ from fastapi import APIRouter, Request from fastapi.responses import HTMLResponse -from server.server_config import templates +from server.server_config import JINJA_TEMPLATES router = APIRouter() @@ -29,7 +29,7 @@ async def catch_all(request: Request, full_path: str) -> HTMLResponse: and other default parameters such as file size. """ - return templates.TemplateResponse( + return JINJA_TEMPLATES.TemplateResponse( "git.jinja", { "request": request, diff --git a/src/server/routers/index.py b/src/server/routers/index.py index af4abd51..1e12b857 100644 --- a/src/server/routers/index.py +++ b/src/server/routers/index.py @@ -3,7 +3,7 @@ from fastapi import APIRouter, Request from fastapi.responses import HTMLResponse -from server.server_config import EXAMPLE_REPOS, templates +from server.server_config import EXAMPLE_REPOS, JINJA_TEMPLATES router = APIRouter() @@ -27,7 +27,7 @@ async def home(request: Request) -> HTMLResponse: and other default parameters such as file size. 
""" - return templates.TemplateResponse( + return JINJA_TEMPLATES.TemplateResponse( "index.jinja", { "request": request, diff --git a/src/server/server_config.py b/src/server/server_config.py index d0b51c4d..04d5c0ef 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -6,12 +6,15 @@ from fastapi.templating import Jinja2Templates -MAX_DISPLAY_SIZE: int = 300_000 -DELETE_REPO_AFTER: int = 60 * 60 # In seconds (1 hour) +from gitingest.utils.config_utils import _get_int_env_var + +MAX_DISPLAY_SIZE: int = _get_int_env_var("MAX_DISPLAY_SIZE", 300_000) +DELETE_REPO_AFTER: int = _get_int_env_var("DELETE_REPO_AFTER", 60 * 60) # In seconds (1 hour) # Slider configuration (if updated, update the logSliderToSize function in src/static/js/utils.js) +MAX_FILE_SIZE_KB: int = _get_int_env_var("MAX_FILE_SIZE_KB", 100 * 1024) # 100 MB +MAX_SLIDER_POSITION: int = _get_int_env_var("MAX_SLIDER_POSITION", 500) # Maximum slider position DEFAULT_FILE_SIZE_KB: int = 5 * 1024 # 5 mb -MAX_FILE_SIZE_KB: int = 100 * 1024 # 100 mb EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/coderamp-labs/gitingest"}, @@ -21,7 +24,4 @@ {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, ] - -# Use absolute path to templates directory -templates_dir = Path(__file__).parent / "templates" -templates = Jinja2Templates(directory=templates_dir) +JINJA_TEMPLATES = Jinja2Templates(directory=Path(__file__).parent / "templates") diff --git a/src/server/server_utils.py b/src/server/server_utils.py index ee6f9eca..812a4a16 100644 --- a/src/server/server_utils.py +++ b/src/server/server_utils.py @@ -160,6 +160,23 @@ def _append_line(path: Path, line: str) -> None: fp.write(f"{line}\n") +def log_slider_to_size(position: int) -> int: + """Convert a slider position to a file size in bytes using a logarithmic scale. + + Parameters + ---------- + position : int + Slider position ranging from 0 to 500. 
+ + Returns + ------- + int + File size in bytes corresponding to the slider position. + + """ + maxv = math.log(MAX_FILE_SIZE_KB) + return round(math.exp(maxv * pow(position / MAX_SLIDER_POSITION, 1.5))) * 1024 + ## Color printing utility class Colors: """ANSI color codes.""" diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index ce95aa9b..9a40512d 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -8,7 +8,7 @@ import pytest -from gitingest.config import MAX_FILE_SIZE +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_FILE_SIZE, MAX_TOTAL_SIZE_BYTES from gitingest.query_parser import parse_remote_repo from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS, _is_valid_git_commit_hash @@ -72,6 +72,9 @@ async def test_parse_query_without_host( "branch": None, "tag": None, "max_file_size": MAX_FILE_SIZE, + "max_directory_depth": MAX_DIRECTORY_DEPTH, + "max_files": MAX_FILES, + "max_total_size_bytes": MAX_TOTAL_SIZE_BYTES, "include_patterns": None, "include_submodules": False, } diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py index 31c474dd..ef647819 100644 --- a/tests/server/test_flow_integration.py +++ b/tests/server/test_flow_integration.py @@ -12,9 +12,6 @@ from src.server.main import app -BASE_DIR = Path(__file__).resolve().parent.parent -TEMPLATE_DIR = BASE_DIR / "src" / "templates" - @pytest.fixture(scope="module") def test_client() -> Generator[TestClient, None, None]: