Commit f94bbf9

feat: implement S3 integration for storing and retrieving digest files
- Add utility functions for S3 configuration, URL generation, and file uploads.
- Enhance the ingestion flow to optionally upload digests to S3 if enabled.
- Modify API endpoints to redirect downloads to S3 if files are stored there.
- Extend the `IngestResponse` schema to include the S3 URL when applicable.
- Introduce a `get_current_commit_hash` utility to retrieve the commit SHA during ingestion.
1 parent 47e9ab3 commit f94bbf9

File tree: 7 files changed (+328, -22 lines)
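
All of the new behavior is opt-in, gated on environment variables that the new `src/gitingest/utils/s3_utils.py` module reads at call time. As a hedged sketch, a development configuration might look like the following; the variable names come from `s3_utils.py` below, and the values are illustrative, matching the MinIO defaults documented in the README:

```python
import os

# Illustrative development-only values; variable names are read by s3_utils.py,
# values match the MinIO defaults documented in the README.
os.environ.update(
    {
        "S3_ENABLED": "true",  # gate checked by is_s3_enabled()
        "S3_ENDPOINT": "http://localhost:9000",  # MinIO API port
        "S3_ACCESS_KEY": "minioadmin",
        "S3_SECRET_KEY": "minioadmin",
        "S3_REGION": "us-east-1",  # default used when unset
        "S3_BUCKET_NAME": "gitingest-bucket",  # default used when unset
        # "S3_ALIAS_HOST": "https://digests.example.com",  # optional public host for returned URLs
    }
)
```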

README.md

Lines changed: 2 additions & 2 deletions
@@ -274,8 +274,8 @@ The file defines three services:
 - Uses the `dev` profile (only available in development mode)
 - Provides S3-compatible storage for local development
 - Accessible via:
-  - API: Port 9000 (http://localhost:9000)
-  - Web Console: Port 9001 (http://localhost:9001)
+  - API: Port 9000 ([localhost:9000](http://localhost:9000))
+  - Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
 - Default admin credentials:
   - Username: `minioadmin`
   - Password: `minioadmin`
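
Not part of the commit, but a quick way to sanity-check that the MinIO service described above is reachable with these documented defaults (the bucket name is the project default; creating it is purely illustrative):

```python
import boto3
from botocore.exceptions import ClientError

# Connect to the local MinIO service using the README defaults
client = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
    region_name="us-east-1",
)
try:
    client.create_bucket(Bucket="gitingest-bucket")  # project default bucket name
except ClientError:
    pass  # bucket already exists
print([b["Name"] for b in client.list_buckets()["Buckets"]])
```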

src/gitingest/schemas/ingestion.py

Lines changed: 3 additions & 0 deletions
@@ -83,6 +83,8 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The patterns to include.
     include_submodules : bool
         Whether to include all Git submodules within the repository. (default: ``False``)
+    s3_url : str | None
+        The S3 URL where the digest is stored if S3 is enabled (default: ``None``).
 
     """
 
@@ -101,6 +103,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     ignore_patterns: set[str] = set()  # TODO: ignore_patterns and include_patterns have the same type
     include_patterns: set[str] | None = None
     include_submodules: bool = False
+    s3_url: str | None = None
 
     def extract_clone_config(self) -> CloneConfig:
         """Extract the relevant fields for the CloneConfig object.

src/gitingest/utils/git_utils.py

Lines changed: 27 additions & 0 deletions
@@ -305,6 +305,33 @@ def create_git_auth_header(token: str, url: str = "https://github.com") -> str:
     return f"http.https://{hostname}/.extraheader=Authorization: Basic {basic}"
 
 
+async def get_current_commit_hash(local_path: str) -> str:
+    """Get the current commit hash from a cloned repository.
+
+    Parameters
+    ----------
+    local_path : str
+        The local path to the cloned repository.
+
+    Returns
+    -------
+    str
+        The current commit hash (SHA).
+
+    Raises
+    ------
+    RuntimeError
+        If unable to get the commit hash.
+
+    """
+    try:
+        stdout, _ = await run_command("git", "-C", local_path, "rev-parse", "HEAD")
+        return stdout.decode().strip()
+    except RuntimeError as exc:
+        msg = f"Failed to get commit hash from {local_path}: {exc}"
+        raise RuntimeError(msg) from exc
+
+
 def validate_github_token(token: str) -> None:
     """Validate the format of a GitHub Personal Access Token.
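
Since `get_current_commit_hash` is a coroutine, callers need a running event loop. A minimal usage sketch, assuming the package is installed and a repository has already been cloned to the given (illustrative) path:

```python
import asyncio

from gitingest.utils.git_utils import get_current_commit_hash


async def main() -> None:
    # Path is illustrative; it must point at an existing clone
    sha = await get_current_commit_hash("/tmp/gitingest/some-clone")
    print(sha)  # full 40-character SHA, e.g. "f94bbf9..."


asyncio.run(main())
```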

src/gitingest/utils/s3_utils.py

Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
+"""S3 utility functions for uploading and managing digest files."""
+
+from __future__ import annotations
+
+import hashlib
+import os
+from typing import Any
+
+import boto3
+from botocore.exceptions import ClientError
+
+
+class S3UploadError(Exception):
+    """Custom exception for S3 upload failures."""
+
+
+def is_s3_enabled() -> bool:
+    """Check if S3 is enabled via environment variables."""
+    return os.getenv("S3_ENABLED", "false").lower() == "true"
+
+
+def get_s3_config() -> dict[str, Any]:
+    """Get S3 configuration from environment variables."""
+    return {
+        "endpoint_url": os.getenv("S3_ENDPOINT"),
+        "aws_access_key_id": os.getenv("S3_ACCESS_KEY"),
+        "aws_secret_access_key": os.getenv("S3_SECRET_KEY"),
+        "region_name": os.getenv("S3_REGION", "us-east-1"),
+    }
+
+
+def get_s3_bucket_name() -> str:
+    """Get the S3 bucket name from environment variables."""
+    return os.getenv("S3_BUCKET_NAME", "gitingest-bucket")
+
+
+def get_s3_alias_host() -> str | None:
+    """Get the S3 alias host for public URLs."""
+    return os.getenv("S3_ALIAS_HOST")
+
+
+def generate_s3_file_path(
+    source: str,
+    user_name: str,
+    repo_name: str,
+    branch: str | None,
+    commit: str | None,
+    include_patterns: set[str] | None,
+    ignore_patterns: set[str],
+) -> str:
+    """Generate an S3 file path with the proper naming convention.
+
+    Format: ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<include & exclude hash>.txt
+    The commit ID is always included in the path. If no specific commit is provided,
+    the actual commit hash from the cloned repository is used.
+
+    Args:
+        source: Git host (github, gitlab, etc.)
+        user_name: Repository owner/user
+        repo_name: Repository name
+        branch: Branch name (if available)
+        commit: Commit hash (should always be available now)
+        include_patterns: Include patterns set
+        ignore_patterns: Ignore patterns set
+
+    Returns:
+        S3 file path string
+
+    """
+    # Extract the provider from the URL, defaulting to "unknown"
+    if "github.com" in source:
+        git_source = "github"
+    elif "gitlab.com" in source:
+        git_source = "gitlab"
+    elif "bitbucket.org" in source:
+        git_source = "bitbucket"
+    else:
+        git_source = "unknown"
+
+    # Use the branch name, falling back to "main" if none is given
+    branch_name = branch or "main"
+
+    # Create a hash of the exclude/include patterns for uniqueness
+    patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
+    patterns_str += f"exclude:{sorted(ignore_patterns)}"
+
+    patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
+
+    # Commit should always be available now, but provide a fallback just in case
+    commit_id = commit or "HEAD"
+
+    # Format: ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt
+    return f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit_id}/{patterns_hash}.txt"
+
+
+def create_s3_client() -> boto3.client:
+    """Create and return an S3 client with configuration from the environment."""
+    config = get_s3_config()
+    return boto3.client("s3", **config)
+
+
+def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str:
+    """Upload content to S3 and return the public URL.
+
+    Args:
+        content: The digest content to upload
+        s3_file_path: The S3 file path
+        ingest_id: The ingest ID to store as an S3 object tag
+
+    Returns:
+        Public URL to access the uploaded file
+
+    Raises:
+        S3UploadError: If the upload fails
+
+    """
+    if not is_s3_enabled():
+        msg = "S3 is not enabled"
+        raise ValueError(msg)
+
+    try:
+        s3_client = create_s3_client()
+        bucket_name = get_s3_bucket_name()
+
+        # Upload the content with ingest_id as an object tag
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=s3_file_path,
+            Body=content.encode("utf-8"),
+            ContentType="text/plain",
+            Tagging=f"ingest_id={ingest_id}",
+        )
+
+        # Generate the public URL
+        alias_host = get_s3_alias_host()
+        if alias_host:
+            # Use the alias host if configured
+            return f"{alias_host.rstrip('/')}/{s3_file_path}"
+        # Fall back to a direct S3 URL
+        endpoint = get_s3_config()["endpoint_url"]
+        if endpoint:
+            return f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}"
+        return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}"
+
+    except ClientError as e:
+        msg = f"Failed to upload to S3: {e}"
+        raise S3UploadError(msg) from e
+
+
+def _build_s3_url(key: str) -> str:
+    """Build the S3 URL for a given key."""
+    alias_host = get_s3_alias_host()
+    if alias_host:
+        return f"{alias_host.rstrip('/')}/{key}"
+    endpoint = get_s3_config()["endpoint_url"]
+    if endpoint:
+        bucket_name = get_s3_bucket_name()
+        return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"
+    bucket_name = get_s3_bucket_name()
+    return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}"
+
+
+def _check_object_tags(s3_client: boto3.client, bucket_name: str, key: str, target_ingest_id: str) -> bool:
+    """Check if an S3 object has the matching ingest_id tag."""
+    try:
+        tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
+        tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
+        return tags.get("ingest_id") == target_ingest_id
+    except ClientError:
+        return False
+
+
+def get_s3_url_for_ingest_id(ingest_id: str) -> str | None:
+    """Get the S3 URL for a given ingest ID if it exists.
+
+    This is used by the download endpoint to redirect to S3 if available.
+    Searches for files using S3 object tags to find the matching ingest_id.
+
+    Args:
+        ingest_id: The ingest ID
+
+    Returns:
+        S3 URL if the file exists, None otherwise
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        s3_client = create_s3_client()
+        bucket_name = get_s3_bucket_name()
+
+        # List all objects under the ingest/ prefix and check their tags
+        paginator = s3_client.get_paginator("list_objects_v2")
+        page_iterator = paginator.paginate(
+            Bucket=bucket_name,
+            Prefix="ingest/",
+        )
+
+        for page in page_iterator:
+            if "Contents" not in page:
+                continue
+
+            for obj in page["Contents"]:
+                key = obj["Key"]
+                if _check_object_tags(s3_client, bucket_name, key, ingest_id):
+                    return _build_s3_url(key)
+
+    except ClientError:
+        pass
+
+    return None
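
`generate_s3_file_path` is a pure function, so its output can be inspected without any S3 configuration at all. A sketch with purely illustrative arguments:

```python
from gitingest.utils.s3_utils import generate_s3_file_path

# Illustrative values; the trailing path component is the first 16 hex
# characters of a SHA-256 over the sorted include/ignore patterns.
path = generate_s3_file_path(
    source="https://github.com/acme/widgets",
    user_name="acme",
    repo_name="widgets",
    branch="main",
    commit="f94bbf9deadbeef",
    include_patterns=None,
    ignore_patterns={"*.log", "node_modules"},
)
print(path)  # ingest/github/acme/widgets/main/f94bbf9deadbeef/<16-hex-chars>.txt
```

Note that `get_s3_url_for_ingest_id` locates a digest by listing every object under the `ingest/` prefix and calling `get_object_tagging` on each one, so lookup cost grows linearly with the number of stored digests; encoding the ingest ID in the key or keeping an external index would avoid the scan if that ever becomes a bottleneck.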

src/server/query_processor.py

Lines changed: 33 additions & 5 deletions
@@ -8,7 +8,8 @@
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import IngestionQuery, parse_query
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import get_current_commit_hash, validate_github_token
+from gitingest.utils.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors, log_slider_to_size

@@ -85,12 +86,39 @@ async def process_query(
         clone_config = query.extract_clone_config()
         await clone_repo(clone_config, token=token)
 
-        summary, tree, content = ingest_query(query)
+        # Get the current commit hash if not already available
+        if not query.commit:
+            try:
+                query.commit = await get_current_commit_hash(clone_config.local_path)
+            except RuntimeError:
+                # If we can't get the commit hash, use a default
+                query.commit = "HEAD"
 
-        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        summary, tree, content = ingest_query(query)
 
-        with local_txt_file.open("w", encoding="utf-8") as f:
-            f.write(tree + "\n" + content)
+        # Prepare the digest content
+        digest_content = tree + "\n" + content
+
+        # Store the digest based on the S3 configuration
+        if is_s3_enabled():
+            # Upload to S3 instead of storing locally
+            s3_file_path = generate_s3_file_path(
+                source=query.url or "",
+                user_name=query.user_name or "",
+                repo_name=query.repo_name or "",
+                branch=query.branch,
+                commit=query.commit,
+                include_patterns=query.include_patterns,
+                ignore_patterns=query.ignore_patterns,
+            )
+            s3_url = upload_to_s3(digest_content, s3_file_path, query.id)
+            # Store the S3 URL on the query for later use
+            query.s3_url = s3_url
+        else:
+            # Store locally as before
+            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+            with local_txt_file.open("w", encoding="utf-8") as f:
+                f.write(digest_content)
 
     except Exception as exc:
         if query and query.url:
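
The modified download endpoint is among the two changed files not shown in this excerpt. A hedged sketch of how the redirect presumably plugs in, built only from the helpers above (the route path and the local-file fallback are assumptions, not taken from this diff):

```python
from fastapi import APIRouter
from fastapi.responses import FileResponse, RedirectResponse

from gitingest.utils.s3_utils import get_s3_url_for_ingest_id

router = APIRouter()


@router.get("/download/{ingest_id}")  # route path is an assumption
async def download_digest(ingest_id: str):
    s3_url = get_s3_url_for_ingest_id(ingest_id)
    if s3_url:
        # Digest was uploaded to S3: redirect instead of serving a local file
        return RedirectResponse(url=s3_url)
    # Fallback: serve the locally written digest (path scheme is hypothetical)
    return FileResponse(f"/tmp/gitingest/{ingest_id}.txt", media_type="text/plain")
```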
