from __future__ import annotations

+ import logging
from pathlib import Path
- from typing import cast
+ from typing import TYPE_CHECKING, cast

from gitingest.clone import clone_repo
from gitingest.ingestion import ingest_query
from gitingest.query_parser import parse_remote_repo
- from gitingest.utils.git_utils import validate_github_token
+ from gitingest.utils.git_utils import resolve_commit, validate_github_token
from gitingest.utils.pattern_utils import process_patterns
- from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
- from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+ from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
+ from server.s3_utils import (
+     _build_s3_url,
+     check_s3_object_exists,
+     generate_s3_file_path,
+     get_metadata_from_s3,
+     is_s3_enabled,
+     upload_metadata_to_s3,
+     upload_to_s3,
+ )
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import Colors

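For context on the new S3Metadata import: the code below constructs it from summary/tree/content and reads the same attributes back on a cache hit, so it is essentially a three-field container. A minimal sketch of what such a model could look like (assuming a Pydantic model, like the other server.models responses; the real definition in server/models.py may differ):

from pydantic import BaseModel

class S3Metadata(BaseModel):
    """Digest metadata cached on S3 alongside the digest itself (sketch)."""

    summary: str
    tree: str
    content: str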
+ if TYPE_CHECKING:
+     from gitingest.schemas.cloning import CloneConfig
+     from gitingest.schemas.ingestion import IngestionQuery
+
+ logger = logging.getLogger(__name__)
+
+
+ async def _check_s3_cache(
+     query: IngestionQuery,
+     input_text: str,
+     max_file_size: int,
+     pattern_type: str,
+     pattern: str,
+     token: str | None,
+ ) -> IngestSuccessResponse | None:
+     """Check if a digest already exists on S3 and return a response if found.
+
+     Parameters
+     ----------
+     query : IngestionQuery
+         The parsed query object.
+     input_text : str
+         Original input text.
+     max_file_size : int
+         Maximum file size in KB.
+     pattern_type : str
+         Pattern type (include/exclude).
+     pattern : str
+         Pattern string.
+     token : str | None
+         GitHub token.
+
+     Returns
+     -------
+     IngestSuccessResponse | None
+         Response if the file exists on S3, None otherwise.
+
+     """
+     if not is_s3_enabled():
+         return None
+
+     try:
+         # Use git ls-remote to get the commit SHA without cloning
+         clone_config = query.extract_clone_config()
+         query.commit = await resolve_commit(clone_config, token=token)
+         # Generate the S3 file path using the resolved commit
+         s3_file_path = generate_s3_file_path(
+             source=query.url,
+             user_name=cast("str", query.user_name),
+             repo_name=cast("str", query.repo_name),
+             commit=query.commit,
+             include_patterns=query.include_patterns,
+             ignore_patterns=query.ignore_patterns,
+         )
+
+         # Check if the file exists on S3
+         if check_s3_object_exists(s3_file_path):
+             # The file exists on S3, serve it directly without cloning
+             s3_url = _build_s3_url(s3_file_path)
+             query.s3_url = s3_url
+
+             short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+             # Try to get cached metadata
+             metadata = get_metadata_from_s3(s3_file_path)
+
+             if metadata:
+                 # Use cached metadata if available
+                 summary = metadata.summary
+                 tree = metadata.tree
+                 content = metadata.content
+             else:
+                 # Fall back to placeholder messages if metadata is not available
+                 summary = "Digest served from cache (S3). Download the full digest to see content details."
+                 tree = "Digest served from cache. Download the full digest to see the file tree."
+                 content = "Digest served from cache. Download the full digest to see the content."
+
+             return IngestSuccessResponse(
+                 repo_url=input_text,
+                 short_repo_url=short_repo_url,
+                 summary=summary,
+                 digest_url=s3_url,
+                 tree=tree,
+                 content=content,
+                 default_max_file_size=max_file_size,
+                 pattern_type=pattern_type,
+                 pattern=pattern,
+             )
+     except Exception as exc:
+         # Log the exception but don't fail the entire request
+         logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+     return None
+
+
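The helper above only hits the cache when the generated S3 key matches an earlier upload, and that key is derived from the repository URL, the commit SHA resolved via git ls-remote, and the include/ignore patterns. A rough sketch of the idea with hypothetical values (the actual key format is defined in server/s3_utils.py):

s3_key = generate_s3_file_path(
    source="https://github.com/octocat/hello-world",  # hypothetical repository
    user_name="octocat",
    repo_name="hello-world",
    commit="a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0",  # SHA from resolve_commit()
    include_patterns=None,
    ignore_patterns=set(),
)
# A later request with the same inputs yields the same key, so
# check_s3_object_exists(s3_key) is True and the clone/ingest steps are skipped.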
+ def _store_digest_content(
+     query: IngestionQuery,
+     clone_config: CloneConfig,
+     digest_content: str,
+     summary: str,
+     tree: str,
+     content: str,
+ ) -> None:
+     """Store the digest content either on S3 or locally, based on configuration.
+
+     Parameters
+     ----------
+     query : IngestionQuery
+         The query object containing repository information.
+     clone_config : CloneConfig
+         The clone configuration object.
+     digest_content : str
+         The complete digest content to store.
+     summary : str
+         The summary content for metadata.
+     tree : str
+         The tree content for metadata.
+     content : str
+         The file content for metadata.
+
+     """
+     if is_s3_enabled():
+         # Upload to S3 instead of storing locally
+         s3_file_path = generate_s3_file_path(
+             source=query.url,
+             user_name=cast("str", query.user_name),
+             repo_name=cast("str", query.repo_name),
+             commit=query.commit,
+             include_patterns=query.include_patterns,
+             ignore_patterns=query.ignore_patterns,
+         )
+         s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+         # Also upload metadata JSON for caching
+         metadata = S3Metadata(
+             summary=summary,
+             tree=tree,
+             content=content,
+         )
+         try:
+             upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+             logger.debug("Successfully uploaded metadata to S3")
+         except Exception as metadata_exc:
+             # Log the error but don't fail the entire request
+             logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+         # Store the S3 URL in the query for later use
+         query.s3_url = s3_url
+     else:
+         # Store locally
+         local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+         with local_txt_file.open("w", encoding="utf-8") as f:
+             f.write(digest_content)
+
+
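Two notes on the helper above: the metadata upload is deliberately best-effort (a failure only logs a warning, the digest itself is still stored), and in the non-S3 branch the digest lands next to the clone directory via with_suffix. For example (hypothetical local path):

from pathlib import Path

local_path = Path("/tmp/gitingest/1234/octocat-hello-world")  # hypothetical clone_config.local_path
digest_file = local_path.with_suffix(".txt")
# digest_file == Path("/tmp/gitingest/1234/octocat-hello-world.txt"),
# presumably what the /api/download/file/<ingest-id> route serves when S3 is disabled.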
+ def _generate_digest_url(query: IngestionQuery) -> str:
+     """Generate the digest URL based on S3 configuration.
+
+     Parameters
+     ----------
+     query : IngestionQuery
+         The query object containing repository information.
+
+     Returns
+     -------
+     str
+         The digest URL.
+
+     Raises
+     ------
+     RuntimeError
+         If S3 is enabled but no S3 URL was generated.
+
+     """
+     if is_s3_enabled():
+         digest_url = getattr(query, "s3_url", None)
+         if not digest_url:
+             # This should not happen if S3 upload was successful
+             msg = "S3 is enabled but no S3 URL was generated"
+             raise RuntimeError(msg)
+         return digest_url
+     return f"/api/download/file/{query.id}"
+
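So the URL handed back to the client takes one of two shapes, depending on configuration (illustrative values only; the exact S3 URL format depends on _build_s3_url and upload_to_s3):

digest_url = _generate_digest_url(query)
# S3 enabled:  the URL stored on query.s3_url by _store_digest_content or _check_s3_cache,
#              e.g. "https://<s3-endpoint>/<bucket>/<s3_file_path>"
# S3 disabled: a local download route, i.e. f"/api/download/file/{query.id}"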

async def process_query(
    input_text: str,
@@ -69,10 +261,22 @@ async def process_query(
        include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
    )

+     # Check if digest already exists on S3 before cloning
+     s3_response = await _check_s3_cache(
+         query=query,
+         input_text=input_text,
+         max_file_size=max_file_size,
+         pattern_type=pattern_type.value,
+         pattern=pattern,
+         token=token,
+     )
+     if s3_response:
+         return s3_response
+
    clone_config = query.extract_clone_config()
    await clone_repo(clone_config, token=token)

-     short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+     short_repo_url = f"{query.user_name}/{query.repo_name}"

    # The commit hash should always be available at this point
    if not query.commit:
@@ -81,30 +285,8 @@ async def process_query(

    try:
        summary, tree, content = ingest_query(query)
-
-         # Prepare the digest content (tree + content)
        digest_content = tree + "\n" + content
-
-         # Store digest based on S3 configuration
-         if is_s3_enabled():
-             # Upload to S3 instead of storing locally
-             s3_file_path = generate_s3_file_path(
-                 source=query.url,
-                 user_name=cast("str", query.user_name),
-                 repo_name=cast("str", query.repo_name),
-                 commit=query.commit,
-                 include_patterns=query.include_patterns,
-                 ignore_patterns=query.ignore_patterns,
-             )
-             s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-             # Store S3 URL in query for later use
-             query.s3_url = s3_url
-         else:
-             # Store locally
-             local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-             with local_txt_file.open("w", encoding="utf-8") as f:
-                 f.write(digest_content)
-
+         _store_digest_content(query, clone_config, digest_content, summary, tree, content)
    except Exception as exc:
        _print_error(query.url, exc, max_file_size, pattern_type, pattern)
        return IngestErrorResponse(error=str(exc))
@@ -123,15 +305,7 @@ async def process_query(
        summary=summary,
    )

-     # Generate digest_url based on S3 configuration
-     if is_s3_enabled():
-         digest_url = getattr(query, "s3_url", None)
-         if not digest_url:
-             # This should not happen if S3 upload was successful
-             msg = "S3 is enabled but no S3 URL was generated"
-             raise RuntimeError(msg)
-     else:
-         digest_url = f"/api/download/file/{query.id}"
+     digest_url = _generate_digest_url(query)

    return IngestSuccessResponse(
        repo_url=input_text,