From c022cfba883706e8cc657dee4e57ff603c6d94ca Mon Sep 17 00:00:00 2001
From: mickael <contact@mickael-caudrelier.fr>
Date: Fri, 18 Jul 2025 14:11:40 +0200
Subject: [PATCH 1/3] feat: implement S3 integration for storing and retrieving
 digest files

---
 .docker/minio/setup.sh                       |  33 ++
 .env.example                                 |  23 ++
 .pre-commit-config.yaml                      |   2 +
 README.md                                    |  85 +++++
 compose.yml                                  | 110 ++++++
 pyproject.toml                               |   1 +
 requirements.txt                             |   1 +
 src/gitingest/query_parser.py                |   6 +-
 src/gitingest/schemas/ingestion.py           |   8 +-
 src/server/models.py                         |   6 +-
 src/server/query_processor.py                |  50 ++-
 src/server/routers/ingest.py                 |  34 +-
 src/server/s3_utils.py                       | 341 +++++++++++++++++++
 src/static/js/utils.js                       |  14 +-
 tests/conftest.py                            |   3 +-
 tests/query_parser/test_git_host_agnostic.py |   2 +-
 tests/server/test_flow_integration.py        |  12 +-
 17 files changed, 693 insertions(+), 38 deletions(-)
 create mode 100755 .docker/minio/setup.sh
 create mode 100644 compose.yml
 create mode 100644 src/server/s3_utils.py

diff --git a/.docker/minio/setup.sh b/.docker/minio/setup.sh
new file mode 100755
index 00000000..3b1b6fb2
--- /dev/null
+++ b/.docker/minio/setup.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# One-shot setup for the development MinIO instance: creates the bucket, an application user and its policy.
+# Expects MINIO_ROOT_USER, MINIO_ROOT_PASSWORD, S3_BUCKET_NAME, S3_ACCESS_KEY and S3_SECRET_KEY in the environment.
+
+# Format bucket name to ensure compatibility (lowercase, underscores replaced by dashes)
+BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+
+# Configure MinIO client (expansions are quoted so credentials with spaces or glob characters stay intact)
+mc alias set myminio http://minio:9000 "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"
+
+# Remove bucket contents if it exists (for clean setup)
+mc rm -r --force "myminio/${BUCKET_NAME}" || true
+
+# Create bucket (no error when it is still present on the persistent volume)
+mc mb --ignore-existing "myminio/${BUCKET_NAME}"
+
+# Set bucket policy to allow downloads
+mc anonymous set download "myminio/${BUCKET_NAME}"
+
+# Create user with access and secret keys
+mc admin user add myminio "${S3_ACCESS_KEY}" "${S3_SECRET_KEY}" || echo "User already exists"
+
+# Create policy for the bucket
+echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'"${BUCKET_NAME}"'/*","arn:aws:s3:::'"${BUCKET_NAME}"'"]}]}' > /tmp/policy.json
+
+# Apply policy
+mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists"
+mc admin policy attach myminio gitingest-policy --user "${S3_ACCESS_KEY}"
+
+echo "MinIO setup completed successfully"
+echo "Bucket: ${BUCKET_NAME}"
+echo "Access via console: http://localhost:9001"
diff --git a/.env.example b/.env.example
index 8d98ebba..aabdbf5a 100644
--- a/.env.example
+++ b/.env.example
@@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
 GITINGEST_SENTRY_SEND_DEFAULT_PII=true
 # Environment name for Sentry (default: "")
 GITINGEST_SENTRY_ENVIRONMENT=development
+
+# MinIO Configuration (for development)
+# Root user credentials for MinIO admin access
+MINIO_ROOT_USER=minioadmin
+MINIO_ROOT_PASSWORD=minioadmin
+
+# S3 Configuration (for application)
+# Set to "true" to enable S3 storage for digests
+# S3_ENABLED=true
+# Endpoint URL for the S3 service (MinIO in development)
+S3_ENDPOINT=http://minio:9000
+# Access key for the S3 bucket (created automatically in development)
+S3_ACCESS_KEY=gitingest
+# Secret key for the S3 bucket (created automatically in development)
+S3_SECRET_KEY=gitingest123
+# Name of the S3 bucket (created automatically in development)
+S3_BUCKET_NAME=gitingest-bucket
+# Region for the S3 bucket (default for MinIO)
+S3_REGION=us-east-1
+# Public URL/CDN for accessing S3 resources (used verbatim as the URL prefix, so it must include the scheme)
+S3_ALIAS_HOST=http://127.0.0.1:9000/gitingest-bucket
+# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
+# S3_DIRECTORY_PREFIX=my-prefix
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4aa5f0e1..529d352a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -113,6 +113,7 @@ repos:
         files: ^src/
         additional_dependencies:
           [
+            boto3>=1.28.0,
             click>=8.0.0,
             'fastapi[standard]>=0.109.1',
             httpx,
@@ -138,6 +139,7 @@ repos:
           - --rcfile=tests/.pylintrc
         additional_dependencies:
           [
+            boto3>=1.28.0,
             click>=8.0.0,
             'fastapi[standard]>=0.109.1',
             httpx,
diff --git a/README.md b/README.md
index 501753e2..a31c780a 100644
--- a/README.md
+++ b/README.md
@@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default.
 
 ## 🐳 Self-host
 
+### Using Docker
+
 1. Build the image:
 
    ``` bash
@@ -239,6 +241,89 @@ The application can be configured using the following environment variables:
 - **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
 - **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
 - **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
+- **S3_ALIAS_HOST**: Public URL/CDN prefix for accessing S3 resources, including the scheme (e.g. "http://127.0.0.1:9000/gitingest-bucket")
+- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
+
+### Using Docker Compose
+
+The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments.
+
+#### Compose File Structure
+
+The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services:
+
+```yaml
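+# Common base configuration for all services
+x-app-base: &app-base
+  ports:
+    - "${APP_WEB_BIND:-8000}:8000"  # Main application port
+    - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090"  # Metrics port
+  user: "1000:1000"
+  command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
+  # ... shared environment variables (Python, allowed hosts, metrics, Sentry)
+  # ... (the image/build source is set per service, not in the base)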
+```
+
+#### Services
+
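+The file defines three long-running services (a one-shot `minio-setup` helper additionally creates the MinIO bucket and credentials in the `dev` profile):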
+
+1. **app**: Production service configuration
+   - Uses the `prod` profile
+   - Sets the Sentry environment to "production"
+   - Configured for stable operation with `restart: unless-stopped`
+
+2. **app-dev**: Development service configuration
+   - Uses the `dev` profile
+   - Enables debug mode
+   - Mounts the source code for live development
+   - Uses hot reloading for faster development
+
+3. **minio**: S3-compatible object storage for development
+   - Uses the `dev` profile (only available in development mode)
+   - Provides S3-compatible storage for local development
+   - Accessible via:
+     - API: Port 9000 ([localhost:9000](http://localhost:9000))
+     - Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
+   - Default admin credentials:
+     - Username: `minioadmin`
+     - Password: `minioadmin`
+   - Configurable via environment variables:
+     - `MINIO_ROOT_USER`: Custom admin username (default: minioadmin)
+     - `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin)
+   - Includes persistent storage via Docker volume
+   - Auto-creates a bucket and application-specific credentials:
+     - Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`)
+     - Access key: `gitingest` (configurable via `S3_ACCESS_KEY`)
+     - Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`)
+   - These credentials are automatically passed to the app-dev service via environment variables:
+     - `S3_ENDPOINT`: URL of the MinIO server
+     - `S3_ACCESS_KEY`: Access key for the S3 bucket
+     - `S3_SECRET_KEY`: Secret key for the S3 bucket
+     - `S3_BUCKET_NAME`: Name of the S3 bucket
+     - `S3_REGION`: Region for the S3 bucket (default: us-east-1)
+     - `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "http://127.0.0.1:9000/gitingest-bucket")
+
+#### Usage Examples
+
+To run the application in development mode:
+
+```bash
+docker compose --profile dev up
+```
+
+To run the application in production mode:
+
+```bash
+docker compose --profile prod up -d
+```
+
+To build and run the application:
+
+```bash
+docker compose --profile prod build
+docker compose --profile prod up -d
+```
 
 ## 🤝 Contributing
 
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 00000000..defe28cd
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,110 @@
+# Common base configuration for all services ("environment" is a mapping with its own anchor: YAML merges are shallow)
+x-app-base: &app-base
+  ports:
+    - "${APP_WEB_BIND:-8000}:8000"  # Main application port
+    - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090"  # Metrics port
+  environment: &app-env
+    # Python Configuration
+    PYTHONUNBUFFERED: "1"
+    PYTHONDONTWRITEBYTECODE: "1"
+    # Host Configuration
+    ALLOWED_HOSTS: "${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}"
+    # Metrics Configuration
+    GITINGEST_METRICS_ENABLED: "${GITINGEST_METRICS_ENABLED:-true}"
+    GITINGEST_METRICS_HOST: "${GITINGEST_METRICS_HOST:-127.0.0.1}"
+    GITINGEST_METRICS_PORT: "${GITINGEST_METRICS_PORT:-9090}"
+    # Sentry Configuration
+    GITINGEST_SENTRY_ENABLED: "${GITINGEST_SENTRY_ENABLED:-false}"
+    GITINGEST_SENTRY_DSN: "${GITINGEST_SENTRY_DSN:-}"
+    GITINGEST_SENTRY_TRACES_SAMPLE_RATE: "${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}"
+    GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE: "${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}"
+    GITINGEST_SENTRY_PROFILE_LIFECYCLE: "${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}"
+    GITINGEST_SENTRY_SEND_DEFAULT_PII: "${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}"
+  user: "1000:1000"
+  command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
+
+services:
+  # Production service configuration
+  app:
+    <<: *app-base
+    image: ghcr.io/coderamp-labs/gitingest:latest
+    profiles:
+      - prod
+    environment:
+      <<: *app-env
+      GITINGEST_SENTRY_ENVIRONMENT: "${GITINGEST_SENTRY_ENVIRONMENT:-production}"
+    restart: unless-stopped
+
+  # Development service configuration
+  app-dev:
+    <<: *app-base
+    build:
+      context: .
+      dockerfile: Dockerfile
+    profiles:
+      - dev
+    environment:
+      <<: *app-env
+      DEBUG: "true"
+      GITINGEST_SENTRY_ENVIRONMENT: "${GITINGEST_SENTRY_ENVIRONMENT:-development}"
+      # S3 Configuration
+      S3_ENABLED: "true"
+      S3_ENDPOINT: "http://minio:9000"
+      S3_ACCESS_KEY: "${S3_ACCESS_KEY:-gitingest}"
+      S3_SECRET_KEY: "${S3_SECRET_KEY:-gitingest123}"
+      S3_BUCKET_NAME: "${S3_BUCKET_NAME:-gitingest-bucket}"  # Lowercase name for MinIO compatibility
+      S3_REGION: "${S3_REGION:-us-east-1}"
+      S3_ALIAS_HOST: "${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}"  # Public URL
+    volumes:
+      # Mount source code for live development
+      - ./src:/app:ro
+    # Use --reload flag for hot reloading during development
+    command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
+    depends_on:
+      minio-setup:
+        condition: service_completed_successfully
+
+  # MinIO S3-compatible object storage for development
+  minio:
+    image: minio/minio:latest
+    profiles:
+      - dev
+    ports:
+      - "9000:9000"  # API port
+      - "9001:9001"  # Console port
+    environment:
+      - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
+      - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
+    volumes:
+      - minio-data:/data
+    command: server /data --console-address ":9001"
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 30s
+      timeout: 30s
+      start_period: 30s
+      start_interval: 1s
+
+  # MinIO setup service to create bucket and user
+  minio-setup:
+    image: minio/mc
+    profiles:
+      - dev
+    depends_on:
+      minio:
+        condition: service_healthy
+    environment:
+      - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
+      - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
+      - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
+      - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
+      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
+    volumes:
+      - ./.docker/minio/setup.sh:/setup.sh:ro
+    entrypoint: sh
+    command: -c /setup.sh
+
+volumes:
+  minio-data:
+    driver: local
diff --git a/pyproject.toml b/pyproject.toml
index 334140dc..0454b2d9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dev = [
 ]
 
 server = [
+    "boto3>=1.28.0",  # AWS SDK for S3 support
     "fastapi[standard]>=0.109.1",  # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
     "prometheus-client",
     "sentry-sdk[fastapi]",
diff --git a/requirements.txt b/requirements.txt
index 712360e9..bdefb957 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+boto3>=1.28.0  # AWS SDK for S3 support
 click>=8.0.0
 fastapi[standard]>=0.109.1  # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
 httpx
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 65b3f065..6262f0db 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
     host = parsed_url.netloc
     user, repo = _get_user_and_repo_from_path(parsed_url.path)
 
-    _id = str(uuid.uuid4())
+    _id = uuid.uuid4()
     slug = f"{user}-{repo}"
-    local_path = TMP_BASE_PATH / _id / slug
+    local_path = TMP_BASE_PATH / str(_id) / slug
     url = f"https://{host}/{user}/{repo}"
 
     query = IngestionQuery(
@@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
     """
     path_obj = Path(path_str).resolve()
     slug = path_obj.name if path_str == "." else path_str.strip("/")
-    return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
+    return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
 
 
 async def _configure_branch_or_tag(
diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py
index 97e98804..92572aeb 100644
--- a/src/gitingest/schemas/ingestion.py
+++ b/src/gitingest/schemas/ingestion.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from pathlib import Path  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from pydantic import BaseModel, Field
 
@@ -27,7 +28,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The URL of the repository.
     slug : str
         The slug of the repository.
-    id : str
+    id : UUID
         The ID of the repository.
     subpath : str
         The subpath to the repository or file (default: ``"/"``).
@@ -47,6 +48,8 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The patterns to include.
     include_submodules : bool
         Whether to include all Git submodules within the repository. (default: ``False``)
+    s3_url : str | None
+        The S3 URL where the digest is stored if S3 is enabled.
 
     """
 
@@ -56,7 +59,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     local_path: Path
     url: str | None = None
     slug: str
-    id: str
+    id: UUID
     subpath: str = Field(default="/")
     type: str | None = None
     branch: str | None = None
@@ -66,6 +69,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     ignore_patterns: set[str] = Field(default_factory=set)  # TODO: ssame type for ignore_* and include_* patterns
     include_patterns: set[str] | None = None
     include_submodules: bool = Field(default=False)
+    s3_url: str | None = None
 
     def extract_clone_config(self) -> CloneConfig:
         """Extract the relevant fields for the CloneConfig object.
diff --git a/src/server/models.py b/src/server/models.py
index 1ed95710..a1aed314 100644
--- a/src/server/models.py
+++ b/src/server/models.py
@@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel):
         Short form of repository URL (user/repo).
     summary : str
         Summary of the ingestion process including token estimates.
-    ingest_id : str
-        Ingestion id used to download full context.
+    digest_url : str
+        URL to download the full digest content (either S3 URL or local download endpoint).
     tree : str
         File tree structure of the repository.
     content : str
@@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel):
     repo_url: str = Field(..., description="Original repository URL")
     short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
     summary: str = Field(..., description="Ingestion summary with token estimates")
-    ingest_id: str = Field(..., description="Ingestion id used to download full context")
+    digest_url: str = Field(..., description="URL to download the full digest content")
     tree: str = Field(..., description="File tree structure")
     content: str = Field(..., description="Processed file content")
     default_max_file_size: int = Field(..., description="File size slider position used")
diff --git a/src/server/query_processor.py b/src/server/query_processor.py
index a7b60f61..88d7ff50 100644
--- a/src/server/query_processor.py
+++ b/src/server/query_processor.py
@@ -11,6 +11,7 @@
 from gitingest.utils.git_utils import validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
+from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors, log_slider_to_size
 
@@ -45,6 +46,11 @@ async def process_query(
     IngestResponse
         A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
 
+    Raises
+    ------
+    RuntimeError
+        If the commit hash is not found (should never happen).
+
     """
     if token:
         validate_github_token(token)
@@ -59,7 +65,6 @@ async def process_query(
         return IngestErrorResponse(error=str(exc))
 
     query.url = cast("str", query.url)
-    query.host = cast("str", query.host)
     query.max_file_size = max_file_size
     query.ignore_patterns, query.include_patterns = process_patterns(
         exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,
@@ -71,13 +76,36 @@ async def process_query(
 
     short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
 
+    # The commit hash should always be available at this point
+    if not query.commit:
+        msg = "Unexpected error: no commit hash found"
+        raise RuntimeError(msg)
+
     try:
         summary, tree, content = ingest_query(query)
 
-        # TODO: why are we writing the tree and content to a file here?
-        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-        with local_txt_file.open("w", encoding="utf-8") as f:
-            f.write(tree + "\n" + content)
+        # Prepare the digest content (tree + content)
+        digest_content = tree + "\n" + content
+
+        # Store digest based on S3 configuration
+        if is_s3_enabled():
+            # Upload to S3 instead of storing locally
+            s3_file_path = generate_s3_file_path(
+                source=query.url,
+                user_name=cast("str", query.user_name),
+                repo_name=cast("str", query.repo_name),
+                commit=query.commit,
+                include_patterns=query.include_patterns,
+                ignore_patterns=query.ignore_patterns,
+            )
+            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+            # Store S3 URL in query for later use
+            query.s3_url = s3_url
+        else:
+            # Store locally
+            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+            with local_txt_file.open("w", encoding="utf-8") as f:
+                f.write(digest_content)
 
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
@@ -97,11 +125,21 @@ async def process_query(
         summary=summary,
     )
 
+    # Generate digest_url based on S3 configuration
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+    else:
+        digest_url = f"/api/download/file/{query.id}"
+
     return IngestSuccessResponse(
         repo_url=input_text,
         short_repo_url=short_repo_url,
         summary=summary,
-        ingest_id=query.id,
+        digest_url=digest_url,
         tree=tree,
         content=content,
         default_max_file_size=slider_position,
diff --git a/src/server/routers/ingest.py b/src/server/routers/ingest.py
index 521b7de0..42efefdf 100644
--- a/src/server/routers/ingest.py
+++ b/src/server/routers/ingest.py
@@ -1,12 +1,16 @@
 """Ingest endpoint for the API."""
 
+from typing import Union
+from uuid import UUID
+
 from fastapi import APIRouter, HTTPException, Request, status
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
 from prometheus_client import Counter
 
 from gitingest.config import TMP_BASE_PATH
 from server.models import IngestRequest
 from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
+from server.s3_utils import is_s3_enabled
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import limiter
 
@@ -39,7 +43,7 @@ async def api_ingest(
     response = await _perform_ingestion(
         input_text=ingest_request.input_text,
         max_file_size=ingest_request.max_file_size,
-        pattern_type=ingest_request.pattern_type,
+        pattern_type=ingest_request.pattern_type.value,
         pattern=ingest_request.pattern,
         token=ingest_request.token,
     )
@@ -90,30 +94,42 @@ async def api_ingest_get(
     return response
 
 
-@router.get("/api/download/file/{ingest_id}", response_class=FileResponse)
-async def download_ingest(ingest_id: str) -> FileResponse:
+@router.get("/api/download/file/{ingest_id}", response_model=None)
+async def download_ingest(
+    ingest_id: UUID,
+) -> Union[RedirectResponse, FileResponse]:  # noqa: FA100 (future-rewritable-type-annotation) (pydantic)
     """Download the first text file produced for an ingest ID.
 
     **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
-    and returns it as a downloadable file. The file is streamed with media type ``text/plain``
-    and prompts the browser to download it.
+    and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled
+    and clients should use the S3 URL provided in the ingest response instead.
 
     **Parameters**
 
-    - **ingest_id** (`str`): Identifier that the ingest step emitted
+    - **ingest_id** (`UUID`): Identifier that the ingest step emitted
 
     **Returns**
 
-    - **FileResponse**: Streamed response with media type ``text/plain``
+    - **FileResponse**: Streamed response with media type ``text/plain`` for local files
 
     **Raises**
 
+    - **HTTPException**: **503** - endpoint is disabled when S3 is enabled
     - **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
     - **HTTPException**: **403** - the process lacks permission to read the directory or file
 
     """
+    # Disable download endpoint when S3 is enabled
+    if is_s3_enabled():
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Download endpoint is disabled when S3 is enabled. "
+            "Use the S3 URL provided in the ingest response instead.",
+        )
+
+    # Fall back to local file serving
     # Normalize and validate the directory path
-    directory = (TMP_BASE_PATH / ingest_id).resolve()
+    directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
     if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
         raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
 
diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py
new file mode 100644
index 00000000..07ebdbe4
--- /dev/null
+++ b/src/server/s3_utils.py
@@ -0,0 +1,341 @@
+"""S3 utility functions for uploading and managing digest files."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import os
+from typing import TYPE_CHECKING
+from urllib.parse import urlparse
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
+
+import boto3
+from botocore.exceptions import ClientError
+
+if TYPE_CHECKING:
+    from botocore.client import BaseClient
+
+# Initialize logger for this module
+logger = logging.getLogger(__name__)
+
+
+class S3UploadError(Exception):
+    """Raised when uploading a digest to S3 fails."""
+
+
+def is_s3_enabled() -> bool:
+    """Return ``True`` only when the ``S3_ENABLED`` environment variable equals "true" (case-insensitive)."""
+    return os.environ.get("S3_ENABLED", "false").lower() == "true"
+
+
+def get_s3_config() -> dict[str, str | None]:
+    """Collect S3 client keyword arguments from the environment, leaving out values that are not set."""
+    candidates = (
+        ("endpoint_url", os.environ.get("S3_ENDPOINT")),
+        ("aws_access_key_id", os.environ.get("S3_ACCESS_KEY")),
+        ("aws_secret_access_key", os.environ.get("S3_SECRET_KEY")),
+        ("region_name", os.environ.get("S3_REGION") or os.environ.get("AWS_REGION", "us-east-1")),
+    )
+    return {name: value for name, value in candidates if value is not None}
+
+
+def get_s3_bucket_name() -> str:
+    """Return the bucket named by ``S3_BUCKET_NAME``, defaulting to ``gitingest-bucket``."""
+    return os.environ.get("S3_BUCKET_NAME", "gitingest-bucket")
+
+
+def get_s3_alias_host() -> str | None:
+    """Return the ``S3_ALIAS_HOST`` value used as the base of public URLs, or ``None`` when unset."""
+    return os.environ.get("S3_ALIAS_HOST")
+
+
+def generate_s3_file_path(
+    source: str,
+    user_name: str,
+    repo_name: str,
+    commit: str,
+    include_patterns: set[str] | None,
+    ignore_patterns: set[str],
+) -> str:
+    """Build the S3 object key under which a digest is stored.
+
+    The key is formatted as:
+    [<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<commit-ID>/<include&exclude hash>.txt
+
+    The prefix is only added when the S3_DIRECTORY_PREFIX environment variable is set and non-empty.
+    ``<provider>`` is derived from the hostname of ``source`` and is ``unknown`` for unrecognised hosts.
+    The hash is the first 16 hex characters of a SHA-256 over the sorted include and exclude patterns.
+
+    Parameters
+    ----------
+    source : str
+        URL of the repository; only its hostname is used (e.g. ``https://github.com/owner/repo``).
+    user_name : str
+        Repository owner or user.
+    repo_name : str
+        Repository name.
+    commit : str
+        Commit hash.
+    include_patterns : set[str] | None
+        Set of patterns specifying which files to include.
+    ignore_patterns : set[str]
+        Set of patterns specifying which files to exclude.
+
+    Returns
+    -------
+    str
+        S3 file path string.
+
+    Raises
+    ------
+    ValueError
+        If no hostname can be extracted from ``source``.
+
+    """
+    # Only the hostname of the source URL matters for the key
+    host = urlparse(source).hostname
+    if host is None:
+        msg = "Invalid source URL"
+        logger.error(msg)
+        raise ValueError(msg)
+
+    # Map well-known hosts to a short provider name; anything else is "unknown"
+    known_providers = {
+        "github.com": "github",
+        "gitlab.com": "gitlab",
+        "bitbucket.org": "bitbucket",
+    }
+    provider = known_providers.get(host, "unknown")
+
+    # Fingerprint the (sorted) pattern sets so different filters get different keys
+    included = sorted(include_patterns) if include_patterns else []
+    fingerprint_src = f"include:{included}exclude:{sorted(ignore_patterns)}"
+    digest = hashlib.sha256(fingerprint_src.encode()).hexdigest()[:16]
+
+    # Commit-scoped key, distinct per provider/owner/repo/commit/filters
+    key = f"ingest/{provider}/{user_name}/{repo_name}/{commit}/{digest}.txt"
+
+    # Optionally nest the key under S3_DIRECTORY_PREFIX (trailing slashes stripped)
+    prefix = os.getenv("S3_DIRECTORY_PREFIX")
+    if prefix:
+        prefix = prefix.rstrip("/")
+        return f"{prefix}/{key}"
+
+    return key
+
+
+def create_s3_client() -> BaseClient:
+    """Create and return an S3 client configured from the environment, logging the config without credentials."""
+    config = get_s3_config()
+    # Strip BOTH credential keys unconditionally; a short-circuiting `pop(...) or pop(...)` would leak the secret
+    secret_keys = ("aws_access_key_id", "aws_secret_access_key")
+    log_config = {name: value for name, value in config.items() if name not in secret_keys}
+    logger.debug(
+        msg="Creating S3 client",
+        extra={
+            "s3_config": log_config,
+            "has_credentials": any(config.get(name) for name in secret_keys),
+        },
+    )
+    return boto3.client("s3", **config)
+
+
+def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
+    """Store a digest in the configured S3 bucket and return its public URL.
+
+    The content is written as UTF-8 ``text/plain`` under ``s3_file_path`` and tagged with the ingest ID.
+    The URL uses S3_ALIAS_HOST when set, then the configured endpoint, then the AWS regional endpoint.
+
+    Parameters
+    ----------
+    content : str
+        The digest content to upload.
+    s3_file_path : str
+        The S3 file path where the content will be stored.
+    ingest_id : UUID
+        The ingest ID to store as an S3 object tag.
+
+    Returns
+    -------
+    str
+        Public URL to access the uploaded file.
+
+    Raises
+    ------
+    ValueError
+        If S3 is not enabled.
+    S3UploadError
+        If the S3 client reports an error during the upload.
+
+    """
+    if not is_s3_enabled():
+        msg = "S3 is not enabled"
+        logger.error(msg)
+        raise ValueError(msg)
+
+    # Client settings and bucket name both come from the environment
+    client = create_s3_client()
+    bucket = get_s3_bucket_name()
+    tag_value = str(ingest_id)
+
+    # Log upload attempt
+    logger.debug(
+        "Starting S3 upload",
+        extra={
+            "bucket_name": bucket,
+            "s3_file_path": s3_file_path,
+            "ingest_id": tag_value,
+            "content_size": len(content),
+        },
+    )
+
+    try:
+        # The ingest ID travels with the object as a tag
+        client.put_object(
+            Bucket=bucket,
+            Key=s3_file_path,
+            Body=content.encode("utf-8"),
+            ContentType="text/plain",
+            Tagging=f"ingest_id={tag_value}",
+        )
+    except ClientError as err:
+        # Record the failure with its traceback, then surface a domain-specific error
+        failure_details = {
+            "bucket_name": bucket,
+            "s3_file_path": s3_file_path,
+            "ingest_id": tag_value,
+            "error_code": err.response.get("Error", {}).get("Code"),
+            "error_message": str(err),
+        }
+        logger.exception("S3 upload failed", extra=failure_details)
+        msg = f"Failed to upload to S3: {err}"
+        raise S3UploadError(msg) from err
+
+    # Resolve the public URL: alias host first, then custom endpoint, then AWS
+    alias_host = get_s3_alias_host()
+    if alias_host:
+        public_url = f"{alias_host.rstrip('/')}/{s3_file_path}"
+    else:
+        s3_config = get_s3_config()
+        endpoint = s3_config.get("endpoint_url")
+        if endpoint:
+            public_url = f"{endpoint.rstrip('/')}/{bucket}/{s3_file_path}"
+        else:
+            public_url = f"https://{bucket}.s3.{s3_config['region_name']}.amazonaws.com/{s3_file_path}"
+
+    # Log successful upload
+    logger.debug(
+        "S3 upload completed successfully",
+        extra={
+            "bucket_name": bucket,
+            "s3_file_path": s3_file_path,
+            "ingest_id": tag_value,
+            "public_url": public_url,
+        },
+    )
+
+    return public_url
+
+
+def _build_s3_url(key: str) -> str:
+    """Build the public URL for a key: alias host first, then custom endpoint, then the regional AWS URL."""
+    alias_host = get_s3_alias_host()
+    if alias_host:
+        return f"{alias_host.rstrip('/')}/{key}"
+
+    bucket_name = get_s3_bucket_name()
+    config = get_s3_config()
+    # "endpoint_url" is optional (the upload path reads it with .get()); indexing would raise KeyError if absent.
+    endpoint = config.get("endpoint_url")
+    if endpoint:
+        return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"
+
+    return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}"
+
+
+def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool:
+    """Tell whether the object's ingest_id tag equals target_ingest_id (False when the tags cannot be read)."""
+    try:
+        response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
+    except ClientError:
+        return False
+    tag_map = {entry["Key"]: entry["Value"] for entry in response.get("TagSet", [])}
+    return tag_map.get("ingest_id") == str(target_ingest_id)
+
+
+def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
+    """Get S3 URL for a given ingest ID if it exists.
+
+    Lists the objects under ``[<S3_DIRECTORY_PREFIX>/]ingest/`` and checks each object's tags for a matching
+    ``ingest_id``. Used by the download endpoint to redirect to S3 if available.
+
+    Parameters
+    ----------
+    ingest_id : UUID
+        The ingest ID to search for in S3 object tags.
+
+    Returns
+    -------
+    str | None
+        S3 URL of the first matching object; None if S3 is disabled, nothing matches, or the lookup fails.
+
+    """
+    if not is_s3_enabled():
+        logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id)
+        return None
+
+    logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)})
+
+    # Keys look like "[<S3_DIRECTORY_PREFIX>/]ingest/..." (see generate_s3_file_path), so list under the same prefix.
+    # NOTE(review): assumes the prefix is joined without a trailing slash - confirm against generate_s3_file_path.
+    directory_prefix = (os.getenv("S3_DIRECTORY_PREFIX") or "").rstrip("/")
+    search_prefix = f"{directory_prefix}/ingest/" if directory_prefix else "ingest/"
+
+    try:
+        s3_client = create_s3_client()
+        bucket_name = get_s3_bucket_name()
+
+        # One tagging request per listed object: cost grows linearly with the number of stored digests.
+        paginator = s3_client.get_paginator("list_objects_v2")
+        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=search_prefix)
+
+        objects_checked = 0
+        for page in page_iterator:
+            for obj in page.get("Contents", []):
+                key = obj["Key"]
+                objects_checked += 1
+                if _check_object_tags(
+                    s3_client=s3_client,
+                    bucket_name=bucket_name,
+                    key=key,
+                    target_ingest_id=ingest_id,
+                ):
+                    s3_url = _build_s3_url(key)
+                    logger.debug(
+                        msg="Found S3 object for ingest ID",
+                        extra={
+                            "ingest_id": str(ingest_id),
+                            "s3_key": key,
+                            "s3_url": s3_url,
+                            "objects_checked": objects_checked,
+                        },
+                    )
+                    return s3_url
+
+        logger.debug(
+            msg="No S3 object found for ingest ID",
+            extra={"ingest_id": str(ingest_id), "objects_checked": objects_checked},
+        )
+
+    except ClientError as err:
+        # Best-effort lookup: log the failure and fall through to None.
+        logger.exception(
+            msg="Error during S3 URL lookup",
+            extra={
+                "ingest_id": str(ingest_id),
+                "error_code": err.response.get("Error", {}).get("Code"),
+                "error_message": str(err),
+            },
+        )
+
+    return None
diff --git a/src/static/js/utils.js b/src/static/js/utils.js
index b20222be..6370036b 100644
--- a/src/static/js/utils.js
+++ b/src/static/js/utils.js
@@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) {
     // Show results section
     showResults();
 
-    // Store the ingest_id for download functionality
-    window.currentIngestId = data.ingest_id;
+    // Store the digest_url for download functionality
+    window.currentDigestUrl = data.digest_url;
 
     // Set plain text content for summary, tree, and content
     document.getElementById('result-summary').value = data.summary || '';
@@ -271,9 +271,9 @@ function copyFullDigest() {
 }
 
 function downloadFullDigest() {
-    // Check if we have an ingest_id
-    if (!window.currentIngestId) {
-        console.error('No ingest_id available for download');
+    // Check if we have a digest_url
+    if (!window.currentDigestUrl) {
+        console.error('No digest_url available for download');
 
         return;
     }
@@ -289,10 +289,10 @@ function downloadFullDigest() {
         Downloading...
     `;
 
-    // Create a download link to the server endpoint
+    // Create a download link using the digest_url
     const a = document.createElement('a');
 
-    a.href = `/api/download/file/${window.currentIngestId}`;
+    a.href = window.currentDigestUrl;
     a.download = 'digest.txt';
     document.body.appendChild(a);
     a.click();
diff --git a/tests/conftest.py b/tests/conftest.py
index 0e279726..fc97551f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8,6 +8,7 @@
 
 import json
 import sys
+import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict
 from unittest.mock import AsyncMock
@@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery:
         repo_name="test_repo",
         local_path=Path("/tmp/test_repo").resolve(),
         slug="test_user/test_repo",
-        id="id",
+        id=uuid.uuid4(),
         branch="main",
         max_file_size=1_000_000,
         ignore_patterns={"*.pyc", "__pycache__", ".git"},
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
index 342d9882..ce95aa9b 100644
--- a/tests/query_parser/test_git_host_agnostic.py
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -55,7 +55,7 @@ async def test_parse_query_without_host(
     query = await parse_remote_repo(url)
 
     # Compare against the canonical dict while ignoring unpredictable fields.
-    actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"})
+    actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"})
 
     assert "commit" in actual
     assert _is_valid_git_commit_hash(actual["commit"])
diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py
index 2129c0d9..31c474dd 100644
--- a/tests/server/test_flow_integration.py
+++ b/tests/server/test_flow_integration.py
@@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",
@@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/nonexistent/repo",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",
@@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None:
     # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository)
     form_data = {
         "input_text": "https://github.com/octocat/hello-world",
-        "max_file_size": "10",
+        "max_file_size": 10,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",
@@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None:
     def make_request() -> None:
         form_data = {
             "input_text": "https://github.com/octocat/hello-world",
-            "max_file_size": "243",
+            "max_file_size": 243,
             "pattern_type": "exclude",
             "pattern": "",
             "token": "",
@@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "1",
+        "max_file_size": 1,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",
@@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "include",
         "pattern": "*.md",
         "token": "",

From b9a42bca9b782dccb831e648d3805e337deca816 Mon Sep 17 00:00:00 2001
From: mickael <contact@mickael-caudrelier.fr>
Date: Sat, 26 Jul 2025 15:47:46 +0200
Subject: [PATCH 2/3] feat: add S3_DIRECTORY_PREFIX env variable in compose
 file

---
 compose.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compose.yml b/compose.yml
index defe28cd..ac0afdbd 100644
--- a/compose.yml
+++ b/compose.yml
@@ -53,6 +53,7 @@ services:
       # Use lowercase bucket name to ensure compatibility with MinIO
       - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
       - S3_REGION=${S3_REGION:-us-east-1}
+      - S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev}
       # Public URL for S3 resources
       - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}
     volumes:

From 6a65cb7d54a3230f79f941f96414e6816ba575fe Mon Sep 17 00:00:00 2001
From: mickael <contact@mickael-caudrelier.fr>
Date: Sat, 26 Jul 2025 16:11:45 +0200
Subject: [PATCH 3/3] feat: update S3 file path format to include hostname and
 structured naming

---
 src/server/s3_utils.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/server/s3_utils.py b/src/server/s3_utils.py
index 07ebdbe4..a30a957f 100644
--- a/src/server/s3_utils.py
+++ b/src/server/s3_utils.py
@@ -60,7 +60,8 @@ def generate_s3_file_path(
     """Generate S3 file path with proper naming convention.
 
     The file path is formatted as:
-    [<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
+    [<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/
+    <exclude&include hash>/<owner>-<repo-name>.txt
 
     If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
     The commit-ID is always included in the URL.
@@ -98,20 +99,13 @@ def generate_s3_file_path(
         logger.error(msg)
         raise ValueError(msg)
 
-    # Extract source from URL or default to "unknown"
-    git_source = {
-        "github.com": "github",
-        "gitlab.com": "gitlab",
-        "bitbucket.org": "bitbucket",
-    }.get(hostname, "unknown")
-
     # Create hash of exclude/include patterns for uniqueness
     patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
     patterns_str += f"exclude:{sorted(ignore_patterns)}"
     patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
 
-    # Build the base path
-    base_path = f"ingest/{git_source}/{user_name}/{repo_name}/{commit}/{patterns_hash}.txt"
+    # Build the base path using hostname directly
+    base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"
 
     # Check for S3_DIRECTORY_PREFIX environment variable
     s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")