From c022cfba883706e8cc657dee4e57ff603c6d94ca Mon Sep 17 00:00:00 2001 From: mickael Date: Fri, 18 Jul 2025 14:11:40 +0200 Subject: [PATCH 1/3] feat: implement S3 integration for storing and retrieving digest files --- .docker/minio/setup.sh | 33 ++ .env.example | 23 ++ .pre-commit-config.yaml | 2 + README.md | 85 +++++ compose.yml | 110 ++++++ pyproject.toml | 1 + requirements.txt | 1 + src/gitingest/query_parser.py | 6 +- src/gitingest/schemas/ingestion.py | 8 +- src/server/models.py | 6 +- src/server/query_processor.py | 50 ++- src/server/routers/ingest.py | 34 +- src/server/s3_utils.py | 341 +++++++++++++++++++ src/static/js/utils.js | 14 +- tests/conftest.py | 3 +- tests/query_parser/test_git_host_agnostic.py | 2 +- tests/server/test_flow_integration.py | 12 +- 17 files changed, 693 insertions(+), 38 deletions(-) create mode 100755 .docker/minio/setup.sh create mode 100644 compose.yml create mode 100644 src/server/s3_utils.py diff --git a/.docker/minio/setup.sh b/.docker/minio/setup.sh new file mode 100755 index 00000000..3b1b6fb2 --- /dev/null +++ b/.docker/minio/setup.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +# Simple script to set up MinIO bucket and user +# Based on example from MinIO issues + +# Format bucket name to ensure compatibility +BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + +# Configure MinIO client +mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} + +# Remove bucket if it exists (for clean setup) +mc rm -r --force myminio/${BUCKET_NAME} || true + +# Create bucket +mc mb myminio/${BUCKET_NAME} + +# Set bucket policy to allow downloads +mc anonymous set download myminio/${BUCKET_NAME} + +# Create user with access and secret keys +mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists" + +# Create policy for the bucket +echo 
'{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json + +# Apply policy +mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists" +mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY} + +echo "MinIO setup completed successfully" +echo "Bucket: ${BUCKET_NAME}" +echo "Access via console: http://localhost:9001" diff --git a/.env.example b/.env.example index 8d98ebba..aabdbf5a 100644 --- a/.env.example +++ b/.env.example @@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace GITINGEST_SENTRY_SEND_DEFAULT_PII=true # Environment name for Sentry (default: "") GITINGEST_SENTRY_ENVIRONMENT=development + +# MinIO Configuration (for development) +# Root user credentials for MinIO admin access +MINIO_ROOT_USER=minioadmin +MINIO_ROOT_PASSWORD=minioadmin + +# S3 Configuration (for application) +# Set to "true" to enable S3 storage for digests +# S3_ENABLED=true +# Endpoint URL for the S3 service (MinIO in development) +S3_ENDPOINT=http://minio:9000 +# Access key for the S3 bucket (created automatically in development) +S3_ACCESS_KEY=gitingest +# Secret key for the S3 bucket (created automatically in development) +S3_SECRET_KEY=gitingest123 +# Name of the S3 bucket (created automatically in development) +S3_BUCKET_NAME=gitingest-bucket +# Region for the S3 bucket (default for MinIO) +S3_REGION=us-east-1 +# Public URL/CDN for accessing S3 resources (must include the scheme; it is used verbatim as the download link) +S3_ALIAS_HOST=http://127.0.0.1:9000/gitingest-bucket +# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) +# S3_DIRECTORY_PREFIX=my-prefix diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4aa5f0e1..529d352a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,6 +113,7 @@ repos: files: ^src/ additional_dependencies: [ + boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', httpx, @@ 
-138,6 +139,7 @@ repos: - --rcfile=tests/.pylintrc additional_dependencies: [ + boto3>=1.28.0, click>=8.0.0, 'fastapi[standard]>=0.109.1', httpx, diff --git a/README.md b/README.md index 501753e2..a31c780a 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default. ## 🐳 Self-host +### Using Docker + 1. Build the image: ``` bash @@ -239,6 +241,89 @@ The application can be configured using the following environment variables: - **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0) - **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace") - **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true") +- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources, including the scheme (default in `compose.yml`: "http://127.0.0.1:9000/gitingest-bucket") +- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value) + +### Using Docker Compose + +The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments. + +#### Compose File Structure + +The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services: + +```yaml +# Common base configuration for all services +x-app-base: &app-base + build: + context: . + dockerfile: Dockerfile + ports: + - "${APP_WEB_BIND:-8000}:8000" # Main application port + - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port + # ... other common configurations +``` + +#### Services + +The file defines three main services (plus a one-shot `minio-setup` helper in the `dev` profile that creates the bucket and credentials): + +1. **app**: Production service configuration + - Uses the `prod` profile + - Sets the Sentry environment to "production" + - Configured for stable operation with `restart: unless-stopped` + +2. 
**app-dev**: Development service configuration + - Uses the `dev` profile + - Enables debug mode + - Mounts the source code for live development + - Uses hot reloading for faster development + +3. **minio**: S3-compatible object storage for development + - Uses the `dev` profile (only available in development mode) + - Provides S3-compatible storage for local development + - Accessible via: + - API: Port 9000 ([localhost:9000](http://localhost:9000)) + - Web Console: Port 9001 ([localhost:9001](http://localhost:9001)) + - Default admin credentials: + - Username: `minioadmin` + - Password: `minioadmin` + - Configurable via environment variables: + - `MINIO_ROOT_USER`: Custom admin username (default: minioadmin) + - `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin) + - Includes persistent storage via Docker volume + - Auto-creates a bucket and application-specific credentials (the setup script empties the bucket on every start, so stored digests do not survive a restart): + - Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`) + - Access key: `gitingest` (configurable via `S3_ACCESS_KEY`) + - Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`) + - These credentials are automatically passed to the app-dev service via environment variables: + - `S3_ENDPOINT`: URL of the MinIO server + - `S3_ACCESS_KEY`: Access key for the S3 bucket + - `S3_SECRET_KEY`: Secret key for the S3 bucket + - `S3_BUCKET_NAME`: Name of the S3 bucket + - `S3_REGION`: Region for the S3 bucket (default: us-east-1) + - `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources, including the scheme (default: "http://127.0.0.1:9000/gitingest-bucket") + +#### Usage Examples + +To run the application in development mode: + +```bash +docker compose --profile dev up +``` + +To run the application in production mode: + +```bash +docker compose --profile prod up -d +``` + +To build the image locally and run it (only `app-dev`, in the `dev` profile, has a `build` section): + +```bash +docker compose --profile dev build +docker compose --profile dev up -d +``` ## 🤝 Contributing diff --git a/compose.yml b/compose.yml new file mode 100644 index 
00000000..defe28cd --- /dev/null +++ b/compose.yml @@ -0,0 +1,113 @@ +# Common base configuration for all services +x-app-base: &app-base + ports: + - "${APP_WEB_BIND:-8000}:8000" # Main application port + - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port + # Mapping (not list) form: a service-level `environment:` replaces this key wholesale, so services re-merge it with `<<: *app-env` + environment: &app-env + # Python Configuration + PYTHONUNBUFFERED: "1" + PYTHONDONTWRITEBYTECODE: "1" + # Host Configuration + ALLOWED_HOSTS: "${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}" + # Metrics Configuration + GITINGEST_METRICS_ENABLED: "${GITINGEST_METRICS_ENABLED:-true}" + GITINGEST_METRICS_HOST: "${GITINGEST_METRICS_HOST:-127.0.0.1}" + GITINGEST_METRICS_PORT: "${GITINGEST_METRICS_PORT:-9090}" + # Sentry Configuration + GITINGEST_SENTRY_ENABLED: "${GITINGEST_SENTRY_ENABLED:-false}" + GITINGEST_SENTRY_DSN: "${GITINGEST_SENTRY_DSN:-}" + GITINGEST_SENTRY_TRACES_SAMPLE_RATE: "${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}" + GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE: "${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}" + GITINGEST_SENTRY_PROFILE_LIFECYCLE: "${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}" + GITINGEST_SENTRY_SEND_DEFAULT_PII: "${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}" + user: "1000:1000" + command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"] + +services: + # Production service configuration + app: + <<: *app-base + image: ghcr.io/coderamp-labs/gitingest:latest + profiles: + - prod + environment: + <<: *app-env + GITINGEST_SENTRY_ENVIRONMENT: "${GITINGEST_SENTRY_ENVIRONMENT:-production}" + restart: unless-stopped + + # Development service configuration + app-dev: + <<: *app-base + build: + context: . 
+ dockerfile: Dockerfile + profiles: + - dev + environment: + <<: *app-env + DEBUG: "true" + GITINGEST_SENTRY_ENVIRONMENT: "${GITINGEST_SENTRY_ENVIRONMENT:-development}" + # S3 Configuration + S3_ENABLED: "true" + S3_ENDPOINT: "http://minio:9000" + S3_ACCESS_KEY: "${S3_ACCESS_KEY:-gitingest}" + S3_SECRET_KEY: "${S3_SECRET_KEY:-gitingest123}" + # Use lowercase bucket name to ensure compatibility with MinIO + S3_BUCKET_NAME: "${S3_BUCKET_NAME:-gitingest-bucket}" + S3_REGION: "${S3_REGION:-us-east-1}" + # Public URL for S3 resources + S3_ALIAS_HOST: "${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}" + volumes: + # Mount source code for live development + - ./src:/app:ro + # Use --reload flag for hot reloading during development + command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] + depends_on: + minio-setup: + condition: service_completed_successfully + + # MinIO S3-compatible object storage for development + minio: + image: minio/minio:latest + profiles: + - dev + ports: + - "9000:9000" # API port + - "9001:9001" # Console port + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin} + volumes: + - minio-data:/data + command: server /data --console-address ":9001" + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 30s + start_period: 30s + start_interval: 1s + + # MinIO setup service to create bucket and user + minio-setup: + image: minio/mc + profiles: + - dev + depends_on: + minio: + condition: service_healthy + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin} + - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest} + - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123} + - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} + volumes: + - ./.docker/minio/setup.sh:/setup.sh:ro + entrypoint: sh + 
command: -c /setup.sh + +volumes: + minio-data: + driver: local diff --git a/pyproject.toml b/pyproject.toml index 334140dc..0454b2d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dev = [ ] server = [ + "boto3>=1.28.0", # AWS SDK for S3 support "fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38) "prometheus-client", "sentry-sdk[fastapi]", diff --git a/requirements.txt b/requirements.txt index 712360e9..bdefb957 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +boto3>=1.28.0 # AWS SDK for S3 support click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 httpx diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 65b3f065..6262f0db 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ host = parsed_url.netloc user, repo = _get_user_and_repo_from_path(parsed_url.path) - _id = str(uuid.uuid4()) + _id = uuid.uuid4() slug = f"{user}-{repo}" - local_path = TMP_BASE_PATH / _id / slug + local_path = TMP_BASE_PATH / str(_id) / slug url = f"https://{host}/{user}/{repo}" query = IngestionQuery( @@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery: """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." 
else path_str.strip("/") - return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4())) + return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4()) async def _configure_branch_or_tag( diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 97e98804..92572aeb 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) +from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) from pydantic import BaseModel, Field @@ -27,7 +28,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The URL of the repository. slug : str The slug of the repository. - id : str + id : UUID The ID of the repository. subpath : str The subpath to the repository or file (default: ``"/"``). @@ -47,6 +48,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The patterns to include. include_submodules : bool Whether to include all Git submodules within the repository. (default: ``False``) + s3_url : str | None + The S3 URL where the digest is stored if S3 is enabled. 
""" @@ -56,7 +59,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes local_path: Path url: str | None = None slug: str - id: str + id: UUID subpath: str = Field(default="/") type: str | None = None branch: str | None = None @@ -66,6 +69,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns include_patterns: set[str] | None = None include_submodules: bool = Field(default=False) + s3_url: str | None = None def extract_clone_config(self) -> CloneConfig: """Extract the relevant fields for the CloneConfig object. diff --git a/src/server/models.py b/src/server/models.py index 1ed95710..a1aed314 100644 --- a/src/server/models.py +++ b/src/server/models.py @@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel): Short form of repository URL (user/repo). summary : str Summary of the ingestion process including token estimates. - ingest_id : str - Ingestion id used to download full context. + digest_url : str + URL to download the full digest content (either S3 URL or local download endpoint). tree : str File tree structure of the repository. 
content : str @@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel): repo_url: str = Field(..., description="Original repository URL") short_repo_url: str = Field(..., description="Short repository URL (user/repo)") summary: str = Field(..., description="Ingestion summary with token estimates") - ingest_id: str = Field(..., description="Ingestion id used to download full context") + digest_url: str = Field(..., description="URL to download the full digest content") tree: str = Field(..., description="File tree structure") content: str = Field(..., description="Processed file content") default_max_file_size: int = Field(..., description="File size slider position used") diff --git a/src/server/query_processor.py b/src/server/query_processor.py index a7b60f61..88d7ff50 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -11,6 +11,7 @@ from gitingest.utils.git_utils import validate_github_token from gitingest.utils.pattern_utils import process_patterns from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType +from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3 from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import Colors, log_slider_to_size @@ -45,6 +46,11 @@ async def process_query( IngestResponse A union type, corresponding to IngestErrorResponse or IngestSuccessResponse + Raises + ------ + RuntimeError + If the commit hash is not found (should never happen). 
+ """ if token: validate_github_token(token) @@ -59,7 +65,6 @@ async def process_query( return IngestErrorResponse(error=str(exc)) query.url = cast("str", query.url) - query.host = cast("str", query.host) query.max_file_size = max_file_size query.ignore_patterns, query.include_patterns = process_patterns( exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None, @@ -71,13 +76,36 @@ async def process_query( short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "/" for the page title + # The commit hash should always be available at this point + if not query.commit: + msg = "Unexpected error: no commit hash found" + raise RuntimeError(msg) + try: summary, tree, content = ingest_query(query) - # TODO: why are we writing the tree and content to a file here? - local_txt_file = Path(clone_config.local_path).with_suffix(".txt") - with local_txt_file.open("w", encoding="utf-8") as f: - f.write(tree + "\n" + content) + # Prepare the digest content (tree + content) + digest_content = tree + "\n" + content + + # Store digest based on S3 configuration + if is_s3_enabled(): + # Upload to S3 instead of storing locally + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id) + # Store S3 URL in query for later use + query.s3_url = s3_url + else: + # Store locally + local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + with local_txt_file.open("w", encoding="utf-8") as f: + f.write(digest_content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) @@ -97,11 +125,21 @@ async def process_query( summary=summary, ) + # Generate digest_url based on S3 configuration + if is_s3_enabled(): + digest_url = 
getattr(query, "s3_url", None) + if not digest_url: + # This should not happen if S3 upload was successful + msg = "S3 is enabled but no S3 URL was generated" + raise RuntimeError(msg) + else: + digest_url = f"/api/download/file/{query.id}" + return IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, summary=summary, - ingest_id=query.id, + digest_url=digest_url, tree=tree, content=content, default_max_file_size=slider_position, diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py index 521b7de0..42efefdf 100644 --- a/src/server/routers/ingest.py +++ b/src/server/routers/ingest.py @@ -1,12 +1,16 @@ """Ingest endpoint for the API.""" +from typing import Union +from uuid import UUID + from fastapi import APIRouter, HTTPException, Request, status -from fastapi.responses import FileResponse, JSONResponse +from fastapi.responses import FileResponse, JSONResponse, RedirectResponse from prometheus_client import Counter from gitingest.config import TMP_BASE_PATH from server.models import IngestRequest from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion +from server.s3_utils import is_s3_enabled from server.server_config import MAX_DISPLAY_SIZE from server.server_utils import limiter @@ -39,7 +43,7 @@ async def api_ingest( response = await _perform_ingestion( input_text=ingest_request.input_text, max_file_size=ingest_request.max_file_size, - pattern_type=ingest_request.pattern_type, + pattern_type=ingest_request.pattern_type.value, pattern=ingest_request.pattern, token=ingest_request.token, ) @@ -90,30 +94,42 @@ async def api_ingest_get( return response -@router.get("/api/download/file/{ingest_id}", response_class=FileResponse) -async def download_ingest(ingest_id: str) -> FileResponse: +@router.get("/api/download/file/{ingest_id}", response_model=None) +async def download_ingest( + ingest_id: UUID, +) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic) 
"""Download the first text file produced for an ingest ID. **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process** - and returns it as a downloadable file. The file is streamed with media type ``text/plain`` - and prompts the browser to download it. + and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled + and clients should use the S3 URL provided in the ingest response instead. **Parameters** - - **ingest_id** (`str`): Identifier that the ingest step emitted + - **ingest_id** (`UUID`): Identifier that the ingest step emitted **Returns** - - **FileResponse**: Streamed response with media type ``text/plain`` + - **FileResponse**: Streamed response with media type ``text/plain`` for local files **Raises** + - **HTTPException**: **503** - endpoint is disabled when S3 is enabled - **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file - **HTTPException**: **403** - the process lacks permission to read the directory or file """ + # Disable download endpoint when S3 is enabled + if is_s3_enabled(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Download endpoint is disabled when S3 is enabled. 
" + "Use the S3 URL provided in the ingest response instead.", + ) + + # Fall back to local file serving # Normalize and validate the directory path - directory = (TMP_BASE_PATH / ingest_id).resolve() + directory = (TMP_BASE_PATH / str(ingest_id)).resolve() if not str(directory).startswith(str(TMP_BASE_PATH.resolve())): raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}") diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py new file mode 100644 index 00000000..07ebdbe4 --- /dev/null +++ b/src/server/s3_utils.py @@ -0,0 +1,341 @@ +"""S3 utility functions for uploading and managing digest files.""" + +from __future__ import annotations + +import hashlib +import logging +import os +from typing import TYPE_CHECKING +from urllib.parse import urlparse +from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic) + +import boto3 +from botocore.exceptions import ClientError + +if TYPE_CHECKING: + from botocore.client import BaseClient + +# Initialize logger for this module +logger = logging.getLogger(__name__) + + +class S3UploadError(Exception): + """Custom exception for S3 upload failures.""" + + +def is_s3_enabled() -> bool: + """Check if S3 is enabled via environment variables.""" + return os.getenv("S3_ENABLED", "false").lower() == "true" + + +def get_s3_config() -> dict[str, str | None]: + """Get S3 configuration from environment variables.""" + config = { + "endpoint_url": os.getenv("S3_ENDPOINT"), + "aws_access_key_id": os.getenv("S3_ACCESS_KEY"), + "aws_secret_access_key": os.getenv("S3_SECRET_KEY"), + "region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"), + } + return {k: v for k, v in config.items() if v is not None} + + +def get_s3_bucket_name() -> str: + """Get S3 bucket name from environment variables.""" + return os.getenv("S3_BUCKET_NAME", "gitingest-bucket") + + +def get_s3_alias_host() -> str | None: + """Get S3 alias host for 
public URLs.""" + return os.getenv("S3_ALIAS_HOST") + + +def generate_s3_file_path( + source: str, + user_name: str, + repo_name: str, + commit: str, + include_patterns: set[str] | None, + ignore_patterns: set[str], +) -> str: + """Generate S3 file path with proper naming convention. + + The file path is formatted as: + [{S3_DIRECTORY_PREFIX}/]ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt + + If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path. + The commit-ID is always included in the URL. + If no specific commit is provided, the actual commit hash from the cloned repository is used. + + Parameters + ---------- + source : str + Repository URL; its hostname selects the git host segment (github, gitlab, bitbucket, otherwise "unknown"). + user_name : str + Repository owner or user. + repo_name : str + Repository name. + commit : str + Commit hash. + include_patterns : set[str] | None + Set of patterns specifying which files to include. + ignore_patterns : set[str] + Set of patterns specifying which files to exclude. + + Returns + ------- + str + S3 file path string. + + Raises + ------ + ValueError + If the source URL is invalid. 
+ + """ + hostname = urlparse(source).hostname + if hostname is None: + msg = "Invalid source URL" + logger.error(msg) + raise ValueError(msg) + + # Extract source from URL or default to "unknown" + git_source = { + "github.com": "github", + "gitlab.com": "gitlab", + "bitbucket.org": "bitbucket", + }.get(hostname, "unknown") + + # Create hash of exclude/include patterns for uniqueness + patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" + patterns_str += f"exclude:{sorted(ignore_patterns)}" + patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] + + # Build the base path + base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt" + + # Check for S3_DIRECTORY_PREFIX environment variable + s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX") + + if not s3_directory_prefix: + return base_path + + # Remove trailing slash if present and add the prefix + s3_directory_prefix = s3_directory_prefix.rstrip("/") + return f"{s3_directory_prefix}/{base_path}" + + +def create_s3_client() -> BaseClient: + """Create and return an S3 client with configuration from environment.""" + config = get_s3_config() + # Log S3 client creation without secrets: both pops must run (chaining them with "or" short-circuits and leaves the secret key in the logged config) + log_config = config.copy() + has_credentials = any([log_config.pop("aws_access_key_id", None), log_config.pop("aws_secret_access_key", None)]) + logger.debug( + msg="Creating S3 client", + extra={ + "s3_config": log_config, + "has_credentials": has_credentials, + }, + ) + return boto3.client("s3", **config) + + +def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str: + """Upload content to S3 and return the public URL. + + This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file. + The ingest ID is stored as an S3 object tag. + + Parameters + ---------- + content : str + The digest content to upload. + s3_file_path : str + The S3 file path where the content will be stored. 
+ ingest_id : UUID + The ingest ID to store as an S3 object tag. + + Returns + ------- + str + Public URL to access the uploaded file. + + Raises + ------ + ValueError + If S3 is not enabled. + S3UploadError + If the upload to S3 fails. + + """ + if not is_s3_enabled(): + msg = "S3 is not enabled" + logger.error(msg) + raise ValueError(msg) + + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + extra_fields = { + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "content_size": len(content), + } + + # Log upload attempt + logger.debug("Starting S3 upload", extra=extra_fields) + + try: + # Upload the content with ingest_id as tag + s3_client.put_object( + Bucket=bucket_name, + Key=s3_file_path, + Body=content.encode("utf-8"), + ContentType="text/plain", + Tagging=f"ingest_id={ingest_id!s}", + ) + except ClientError as err: + # Log upload failure + logger.exception( + "S3 upload failed", + extra={ + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, + ) + msg = f"Failed to upload to S3: {err}" + raise S3UploadError(msg) from err + + # Generate public URL + alias_host = get_s3_alias_host() + if alias_host: + # Use alias host if configured + public_url = f"{alias_host.rstrip('/')}/{s3_file_path}" + else: + # Fallback to direct S3 URL + endpoint = get_s3_config().get("endpoint_url") + if endpoint: + public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}" + else: + public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}" + + # Log successful upload + logger.debug( + "S3 upload completed successfully", + extra={ + "bucket_name": bucket_name, + "s3_file_path": s3_file_path, + "ingest_id": str(ingest_id), + "public_url": public_url, + }, + ) + + return public_url + + +def _build_s3_url(key: str) -> str: + """Build S3 URL 
for a given key.""" + alias_host = get_s3_alias_host() + if alias_host: + return f"{alias_host.rstrip('/')}/{key}" + + bucket_name = get_s3_bucket_name() + config = get_s3_config() + + endpoint = config.get("endpoint_url")  # key is absent (not None) when S3_ENDPOINT is unset, e.g. plain AWS S3 + if endpoint: + return f"{endpoint.rstrip('/')}/{bucket_name}/{key}" + + return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}" + + +def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool: + """Check if an S3 object has the matching ingest_id tag.""" + try: + tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key) + tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])} + return tags.get("ingest_id") == str(target_ingest_id) + except ClientError: + return False + + +def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None: + """Get S3 URL for a given ingest ID if it exists. + + Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found. + Used by the download endpoint to redirect to S3 if available. + + Parameters + ---------- + ingest_id : UUID + The ingest ID to search for in S3 object tags. + + Returns + ------- + str | None + S3 URL if file exists, None otherwise. 
+ + """ + if not is_s3_enabled(): + logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id) + return None + + logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)}) + + try: + s3_client = create_s3_client() + bucket_name = get_s3_bucket_name() + + # List all objects under [S3_DIRECTORY_PREFIX/]ingest/ (same prefix rule as generate_s3_file_path) and check their tags + dir_prefix = os.getenv("S3_DIRECTORY_PREFIX") + page_iterator = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket_name, Prefix=f"{dir_prefix.rstrip('/')}/ingest/" if dir_prefix else "ingest/") + + objects_checked = 0 + for page in page_iterator: + if "Contents" not in page: + continue + + for obj in page["Contents"]: + key = obj["Key"] + objects_checked += 1 + if _check_object_tags( + s3_client=s3_client, + bucket_name=bucket_name, + key=key, + target_ingest_id=ingest_id, + ): + s3_url = _build_s3_url(key) + logger.debug( + msg="Found S3 object for ingest ID", + extra={ + "ingest_id": str(ingest_id), + "s3_key": key, + "s3_url": s3_url, + "objects_checked": objects_checked, + }, + ) + return s3_url + + logger.debug( + msg="No S3 object found for ingest ID", + extra={ + "ingest_id": str(ingest_id), + "objects_checked": objects_checked, + }, + ) + + except ClientError as err: + logger.exception( + msg="Error during S3 URL lookup", + extra={ + "ingest_id": str(ingest_id), + "error_code": err.response.get("Error", {}).get("Code"), + "error_message": str(err), + }, + ) + + return None diff --git a/src/static/js/utils.js b/src/static/js/utils.js index b20222be..6370036b 100644 --- a/src/static/js/utils.js +++ b/src/static/js/utils.js @@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) { // Show results section showResults(); - // Store the ingest_id for download functionality - window.currentIngestId = data.ingest_id; + // Store the digest_url for download functionality + window.currentDigestUrl = data.digest_url; // Set plain text content for summary, tree, and content document.getElementById('result-summary').value = data.summary || ''; @@ -271,9 +271,9 
@@ function copyFullDigest() { } function downloadFullDigest() { - // Check if we have an ingest_id - if (!window.currentIngestId) { - console.error('No ingest_id available for download'); + // Check if we have a digest_url + if (!window.currentDigestUrl) { + console.error('No digest_url available for download'); return; } @@ -289,10 +289,10 @@ function downloadFullDigest() { Downloading... `; - // Create a download link to the server endpoint + // Create a download link using the digest_url const a = document.createElement('a'); - a.href = `/api/download/file/${window.currentIngestId}`; + a.href = window.currentDigestUrl; a.download = 'digest.txt'; document.body.appendChild(a); a.click(); diff --git a/tests/conftest.py b/tests/conftest.py index 0e279726..fc97551f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,7 @@ import json import sys +import uuid from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict from unittest.mock import AsyncMock @@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery: repo_name="test_repo", local_path=Path("/tmp/test_repo").resolve(), slug="test_user/test_repo", - id="id", + id=uuid.uuid4(), branch="main", max_file_size=1_000_000, ignore_patterns={"*.pyc", "__pycache__", ".git"}, diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 342d9882..ce95aa9b 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -55,7 +55,7 @@ async def test_parse_query_without_host( query = await parse_remote_repo(url) # Compare against the canonical dict while ignoring unpredictable fields. 
- actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"}) + actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"}) assert "commit" in actual assert _is_valid_git_commit_hash(actual["commit"]) diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py index 2129c0d9..31c474dd 100644 --- a/tests/server/test_flow_integration.py +++ b/tests/server/test_flow_integration.py @@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/nonexistent/repo", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None: # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository) form_data = { "input_text": "https://github.com/octocat/hello-world", - "max_file_size": "10", + "max_file_size": 10, "pattern_type": "exclude", "pattern": "", "token": "", @@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None: def make_request() -> None: form_data = { "input_text": "https://github.com/octocat/hello-world", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "exclude", "pattern": "", "token": "", @@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "1", + "max_file_size": 1, 
"pattern_type": "exclude", "pattern": "", "token": "", @@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", + "max_file_size": 243, "pattern_type": "include", "pattern": "*.md", "token": "", From b9a42bca9b782dccb831e648d3805e337deca816 Mon Sep 17 00:00:00 2001 From: mickael Date: Sat, 26 Jul 2025 15:47:46 +0200 Subject: [PATCH 2/3] feat: add S3_DIRECTORY_PREFIX env variable in compose file --- compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/compose.yml b/compose.yml index defe28cd..ac0afdbd 100644 --- a/compose.yml +++ b/compose.yml @@ -53,6 +53,7 @@ services: # Use lowercase bucket name to ensure compatibility with MinIO - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket} - S3_REGION=${S3_REGION:-us-east-1} + - S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev} # Public URL for S3 resources - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}} volumes: From 6a65cb7d54a3230f79f941f96414e6816ba575fe Mon Sep 17 00:00:00 2001 From: mickael Date: Sat, 26 Jul 2025 16:11:45 +0200 Subject: [PATCH 3/3] feat: update S3 file path format to include hostname and structured naming --- src/server/s3_utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py index 07ebdbe4..a30a957f 100644 --- a/src/server/s3_utils.py +++ b/src/server/s3_utils.py @@ -60,7 +60,8 @@ def generate_s3_file_path( """Generate S3 file path with proper naming convention. The file path is formatted as: - [/]ingest//////.txt + [/]ingest////// + /-.txt If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path. The commit-ID is always included in the URL. 
@@ -98,20 +99,13 @@ def generate_s3_file_path( logger.error(msg) raise ValueError(msg) - # Extract source from URL or default to "unknown" - git_source = { - "github.com": "github", - "gitlab.com": "gitlab", - "bitbucket.org": "bitbucket", - }.get(hostname, "unknown") - # Create hash of exclude/include patterns for uniqueness patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}" patterns_str += f"exclude:{sorted(ignore_patterns)}" patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16] - # Build the base path - base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt" + # Build the base path using hostname directly + base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt" # Check for S3_DIRECTORY_PREFIX environment variable s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")