
Commit efe5a26

feat: serve cached digest if available (#462)
Co-authored-by: Nicolas IRAGNE <nicoragne@hotmail.fr>
Parent: a63ed9e

4 files changed: +427, -38 lines

.github/workflows/ci.yml
Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ jobs:
   test:
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: true
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]

src/server/models.py
Lines changed: 19 additions & 0 deletions

@@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
 IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
 
 
+class S3Metadata(BaseModel):
+    """Model for S3 metadata structure.
+
+    Attributes
+    ----------
+    summary : str
+        Summary of the ingestion process including token estimates.
+    tree : str
+        File tree structure of the repository.
+    content : str
+        Processed content from the repository files.
+
+    """
+
+    summary: str = Field(..., description="Ingestion summary with token estimates")
+    tree: str = Field(..., description="File tree structure")
+    content: str = Field(..., description="Processed file content")
+
+
 class QueryForm(BaseModel):
     """Form data for the query.
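
S3Metadata is a plain Pydantic model, so the digest metadata can round-trip through JSON on its way to and from S3. A minimal sketch of that round trip, assuming Pydantic v2 (under v1 the equivalents are .json() and .parse_raw()); the field values are made up:

from server.models import S3Metadata

# Build metadata from one ingestion run (illustrative values).
metadata = S3Metadata(
    summary="Repository: octocat/hello-world\nEstimated tokens: 1.2k",
    tree="hello-world/\n└── README.md",
    content="=== README.md ===\nHello!",
)

# Serialize for upload to S3.
payload = metadata.model_dump_json()

# On a cache hit, the stored JSON validates back into the same model.
restored = S3Metadata.model_validate_json(payload)
assert restored.tree == metadata.tree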

src/server/query_processor.py
Lines changed: 211 additions & 37 deletions

@@ -2,19 +2,211 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
-from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        query.commit = await resolve_commit(clone_config, token=token)
+        # Generate S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+
+        # Check if file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.summary
+                tree = metadata.tree
+                content = metadata.content
+            else:
+                # Fallback to placeholder messages if metadata not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = S3Metadata(
+            summary=summary,
+            tree=tree,
+            content=content,
+        )
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
 
 async def process_query(
     input_text: str,
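
The cache check above leans on resolve_commit to turn a ref into a commit SHA without cloning, as the git ls-remote comment notes. The real implementation lives in gitingest.utils.git_utils and is not part of this diff; a minimal standalone sketch of the underlying idea (function name and error handling here are illustrative):

import asyncio


async def resolve_commit_sketch(repo_url: str, ref: str = "HEAD") -> str:
    """Resolve a ref to a commit SHA via `git ls-remote`, without cloning."""
    proc = await asyncio.create_subprocess_exec(
        "git", "ls-remote", repo_url, ref,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip())
    # Each output line is "<sha>\t<ref>"; take the SHA of the first match.
    fields = stdout.decode().split()
    if not fields:
        raise ValueError(f"ref {ref!r} not found in {repo_url}")
    return fields[0]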
@@ -69,10 +261,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:

@@ -81,30 +285,8 @@
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))

@@ -123,15 +305,7 @@ async def process_query(
         summary=summary,
     )
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,
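
The S3 helpers imported at the top of this file (check_s3_object_exists, get_metadata_from_s3, upload_metadata_to_s3, _build_s3_url) live in src/server/s3_utils.py, presumably the fourth changed file, whose diff is not captured here. A minimal sketch of how the two read-side helpers might look with boto3; the bucket name and the .json sidecar key are assumptions, not confirmed by this commit:

from __future__ import annotations

import json

import boto3
from botocore.exceptions import ClientError

from server.models import S3Metadata

s3 = boto3.client("s3")
BUCKET = "gitingest-digests"  # assumed bucket name


def check_s3_object_exists(s3_file_path: str) -> bool:
    """Probe for the digest with a HEAD request; no object body is transferred."""
    try:
        s3.head_object(Bucket=BUCKET, Key=s3_file_path)
    except ClientError:
        return False
    return True


def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None:
    """Fetch the metadata JSON stored next to the digest, or None if absent."""
    try:
        obj = s3.get_object(Bucket=BUCKET, Key=s3_file_path + ".json")  # assumed key layout
    except ClientError:
        return None
    return S3Metadata(**json.loads(obj["Body"].read()))

On a cache hit, process_query never clones: it resolves the commit, finds the digest object, and answers from the metadata sidecar (or the placeholder texts), so repeat ingests of the same commit and pattern set cost one HEAD request and at most one GET.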
