
Commit debf757

perf: implement memory optimization for file processing and content handling
1 parent 576e9cc commit debf757

File tree

4 files changed: +291 -37 lines changed


src/gitingest/entrypoint.py

Lines changed: 21 additions & 3 deletions
@@ -260,10 +260,28 @@ async def _write_output(tree: str, content: str, target: str | None) -> None:
         The path to the output file. If ``None``, the results are not written to a file.

     """
-    data = f"{tree}\n{content}"
     loop = asyncio.get_running_loop()
+
     if target == "-":
-        await loop.run_in_executor(None, sys.stdout.write, data)
+        # Write to stdout in chunks to avoid large memory allocation
+        await loop.run_in_executor(None, sys.stdout.write, tree)
+        await loop.run_in_executor(None, sys.stdout.write, "\n")
+        await loop.run_in_executor(None, sys.stdout.write, content)
         await loop.run_in_executor(None, sys.stdout.flush)
     elif target is not None:
-        await loop.run_in_executor(None, Path(target).write_text, data, "utf-8")
+        # Write to file in chunks to avoid large memory allocation
+        target_path = Path(target)
+
+        # Define synchronous functions for file operations
+        def write_tree() -> None:
+            with target_path.open("w", encoding="utf-8") as f:
+                f.write(tree)
+                f.write("\n")
+
+        def append_content() -> None:
+            with target_path.open("a", encoding="utf-8") as f:
+                f.write(content)
+
+        # Execute file operations
+        await loop.run_in_executor(None, write_tree)
+        await loop.run_in_executor(None, append_content)
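
For orientation, a minimal usage sketch of the chunked writer; the strings and the digest.txt path are hypothetical, and importing a private helper is done here only for illustration:

import asyncio

from gitingest.entrypoint import _write_output  # private helper, imported only for illustration


async def demo() -> None:
    tree = "Directory structure:\n└── README.md"
    content = "FILE: README.md\nhello world\n"
    await _write_output(tree, content, target="-")           # chunked writes to stdout
    await _write_output(tree, content, target="digest.txt")  # write tree, then append content


asyncio.run(demo())

The on-disk result is byte-for-byte identical to the old single write; the saving is that the concatenated f"{tree}\n{content}" copy is never materialized, even though tree and content themselves remain in memory.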

src/gitingest/ingestion.py

Lines changed: 34 additions & 3 deletions
@@ -1,4 +1,12 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Lazy loading: File content is only loaded when accessed and cached to avoid repeated reads
+- Chunked reading: Large files are read in chunks to avoid loading everything at once
+- Content cache clearing: Periodically clears content cache to free memory during processing
+- Memory limits: Skips files that would cause excessive memory usage
+- Early termination: Stops processing when limits are reached
+"""

 from __future__ import annotations

@@ -65,7 +73,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
             msg = f"File {file_node.name} has no content"
             raise ValueError(msg)

-        return format_node(file_node, query=query)
+        result = format_node(file_node, query=query)
+
+        # Clear content cache to free memory
+        file_node.clear_content_cache()
+
+        return result

     root_node = FileSystemNode(
         name=path.name,
@@ -78,7 +91,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:

     _process_node(node=root_node, query=query, stats=stats)

-    return format_node(root_node, query=query)
+    result = format_node(root_node, query=query)
+
+    # Clear content cache to free memory after formatting
+    root_node.clear_content_cache()
+
+    return result


 def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None:
@@ -173,6 +191,8 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     This function checks the file's size, increments the statistics, and reads its content.
     If the file size exceeds the maximum allowed, it raises an error.

+    Implements memory optimization by checking limits before processing files.
+
     Parameters
     ----------
     path : Path
@@ -194,6 +214,11 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
         print(f"Skipping file {path}: would exceed total size limit")
         return

+    # Skip very large files that would consume too much memory
+    if file_size > MAX_TOTAL_SIZE_BYTES / 10:  # Limit single file to 10% of total limit
+        print(f"Skipping file {path}: file is too large for memory-efficient processing")
+        return
+
     stats.total_files += 1
     stats.total_size += file_size

@@ -211,6 +236,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     parent_node.size += file_size
     parent_node.file_count += 1

+    # If we've processed a lot of files, clear any cached content to free memory
+    if stats.total_files % 100 == 0:
+        for sibling in parent_node.children[:-10]:  # Keep the 10 most recent files cached
+            if sibling.type == FileSystemNodeType.FILE:
+                sibling.clear_content_cache()
+

 def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
     """Check if any of the traversal limits have been exceeded.

src/gitingest/output_formatter.py

Lines changed: 165 additions & 19 deletions
@@ -1,8 +1,16 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Generator-based processing: Uses generators to process files one at a time
+- Streaming approach: Avoids loading all file contents into memory at once
+- Works with lazy loading: Complements the lazy loading in FileSystemNode
+"""

 from __future__ import annotations

-from typing import TYPE_CHECKING
+import gc
+import io
+from typing import TYPE_CHECKING, Generator

 import tiktoken

@@ -47,12 +55,45 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,

     tree = "Directory structure:\n" + _create_tree_structure(query, node=node)

-    content = _gather_file_contents(node)
+    # Estimate tokens for tree
+    tree_tokens = _count_tokens(tree)
+
+    # For token estimation, we need to sample some content
+    # We'll use a small sample to estimate without loading everything
+    content_sample = ""
+    content_generator = _gather_file_contents(node)
+
+    # Try to get a small sample for token estimation
+    try:
+        # Get first item from generator for sampling
+        first_item = next(content_generator)
+        sample_size = min(len(first_item), 10000)  # Limit sample size
+        content_sample = first_item[:sample_size]
+    except StopIteration:
+        # No content
+        pass
+
+    # Estimate tokens based on sample
+    sample_tokens = _count_tokens(content_sample)
+
+    # If we have a sample, extrapolate total tokens based on file sizes
+    if sample_tokens > 0 and len(content_sample) > 0:
+        # Estimate tokens per byte
+        tokens_per_byte = sample_tokens / len(content_sample)
+        # Estimate total tokens based on total file size
+        estimated_content_tokens = int(node.size * tokens_per_byte)
+        total_tokens = tree_tokens + estimated_content_tokens
+    else:
+        total_tokens = tree_tokens

-    token_estimate = _format_token_count(tree + content)
+    token_estimate = _format_token_count(total_tokens)
     if token_estimate:
         summary += f"\nEstimated tokens: {token_estimate}"

+    # For backward compatibility with tests, return content as a string
+    # But use a more memory-efficient approach by processing files in chunks
+    content = _gather_content_string(node)
+
     return summary, tree, content
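
To make the extrapolation concrete, here is the arithmetic with hypothetical numbers: a 10,000-character sample that encodes to 2,500 tokens gives 0.25 tokens per byte, so a 4 MB repository is estimated at roughly one million content tokens:

# Hypothetical numbers illustrating the sampling estimate in format_node
sample_tokens = 2_500                                        # tokens counted in the sample
sample_len = 10_000                                          # characters in the sample
tokens_per_byte = sample_tokens / sample_len                 # 0.25
node_size = 4 * 1024 * 1024                                  # total bytes across all files
estimated_content_tokens = int(node_size * tokens_per_byte)  # 1_048_576

Note that the ratio divides a token count by a character count, so treating it as "per byte" is only exact for single-byte text, and the estimate is only as good as the first sampled file is representative of the rest.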

@@ -93,28 +134,115 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)
     return "\n".join(parts) + "\n"


-def _gather_file_contents(node: FileSystemNode) -> str:
+def _gather_file_contents(node: FileSystemNode) -> Generator[str]:
     """Recursively gather contents of all files under the given node.

-    This function recursively processes a directory node and gathers the contents of all files
-    under that node. It returns the concatenated content of all files as a single string.
+    This function recursively processes a directory node and yields the contents of all files
+    under that node one at a time. Instead of concatenating all content into a single string,
+    it returns a generator that yields each file's content separately.
+
+    The implementation is memory-efficient, processing one file at a time and using
+    generators to avoid loading all content into memory at once.

     Parameters
     ----------
     node : FileSystemNode
         The current directory or file node being processed.

+    Yields
+    ------
+    Generator[str]
+        The content of each file as a string.
+
+    """
+    if node.type != FileSystemNodeType.DIRECTORY:
+        yield node.content_string
+        # Clear content cache immediately after yielding to free memory
+        node.clear_content_cache()
+    else:
+        # Process one child at a time to avoid loading all content at once
+        for child in node.children:
+            yield from _gather_file_contents(child)
+
+
+def _gather_content_string(node: FileSystemNode) -> str:
+    """Gather file contents as a string, but in a memory-efficient way.
+
+    This function processes files in chunks to avoid loading all content into memory at once.
+    It builds the content string incrementally, clearing file content caches as it goes.
+
+    For very large repositories, it uses a more aggressive chunking strategy to minimize memory usage.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to process.
+
     Returns
     -------
     str
-        The concatenated content of all files under the given node.
+        The combined content string.

     """
-    if node.type != FileSystemNodeType.DIRECTORY:
-        return node.content_string
-
-    # Recursively gather contents of all files under the current directory
-    return "\n".join(_gather_file_contents(child) for child in node.children)
+    # For very small repositories (less than 10MB), use simple approach
+    if node.size < 10 * 1024 * 1024:
+        content_chunks = list(_gather_file_contents(node))
+        return "\n".join(content_chunks)
+
+    # For medium repositories (10MB to 100MB), use chunked approach
+    if node.size < 100 * 1024 * 1024:
+        # Use a list to accumulate content chunks
+        content_chunks = []
+        chunk_size = 0
+        max_chunk_size = 5 * 1024 * 1024  # 5MB per chunk
+
+        # Process files in batches to limit memory usage
+        for content_item in _gather_file_contents(node):
+            content_chunks.append(content_item)
+            chunk_size += len(content_item)
+
+            # If we've accumulated enough content, join it and reset
+            if chunk_size >= max_chunk_size:
+                # Join the current chunks
+                joined_chunk = "\n".join(content_chunks)
+                # Reset the chunks list with just the joined chunk
+                content_chunks = [joined_chunk]
+                # Update the chunk size
+                chunk_size = len(joined_chunk)
+
+        # Join any remaining chunks
+        return "\n".join(content_chunks)
+
+    # For large repositories (over 100MB), use a hybrid approach with StringIO
+    # Use StringIO as a memory-efficient buffer
+    buffer = io.StringIO()
+    flush_interval = 100  # Flush to string every 100 files
+
+    # Process files and write to buffer
+    for i, content_item in enumerate(_gather_file_contents(node)):
+        buffer.write(content_item)
+        buffer.write("\n")
+
+        # Periodically get the current value to avoid buffer growing too large
+        if (i + 1) % flush_interval == 0:
+            # Get current value
+            current_value = buffer.getvalue()
+
+            # Reset buffer
+            buffer.close()
+            buffer = io.StringIO()
+
+            # Write current value back to buffer
+            buffer.write(current_value)
+
+            # Force garbage collection to free memory
+            gc.collect()
+
+    # Get final result
+    result = buffer.getvalue()
+    buffer.close()
+
+    return result


 def _create_tree_structure(
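
Because _gather_file_contents is now a generator, callers other than _gather_content_string can also stream content without materializing it; a small consumption sketch, assuming root_node is a populated FileSystemNode:

# Stream file contents one piece at a time; nothing is concatenated here
total_chars = 0
for piece in _gather_file_contents(root_node):
    total_chars += len(piece)  # e.g., accumulate a statistic per file
print(f"streamed {total_chars} characters")

One trade-off worth noting: each yielded piece clears its node's cache, so iterating the generator a second time re-reads every file from disk through the lazy loader.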
@@ -169,25 +297,43 @@
     return tree_str


-def _format_token_count(text: str) -> str | None:
-    """Return a human-readable token-count string (e.g. 1.2k, 1.2 M).
+def _count_tokens(text: str) -> int:
+    """Count the number of tokens in a text string.

     Parameters
     ----------
     text : str
-        The text string for which the token count is to be estimated.
+        The text string for which to count tokens.

     Returns
     -------
-    str | None
-        The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs.
+    int
+        The number of tokens in the text, or 0 if an error occurs.

     """
     try:
         encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
+        return len(encoding.encode(text, disallowed_special=()))
     except (ValueError, UnicodeEncodeError) as exc:
         print(exc)
+        return 0
+
+
+def _format_token_count(total_tokens: int) -> str | None:
+    """Return a human-readable token-count string (e.g. 1.2k, 1.2 M).
+
+    Parameters
+    ----------
+    total_tokens : int
+        The number of tokens to format.
+
+    Returns
+    -------
+    str | None
+        The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if total_tokens is 0.
+
+    """
+    if total_tokens == 0:
         return None

     for threshold, suffix in _TOKEN_THRESHOLDS:
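
The split separates counting from formatting, so the estimated count computed in format_node can be formatted without re-encoding any text; a quick sketch of the call pattern, with a hypothetical input string:

tokens = _count_tokens("Directory structure:\n└── README.md")  # exact count via tiktoken
label = _format_token_count(tokens)  # human-readable, e.g. "1.2k"; None when tokens == 0
if label:
    print(f"Estimated tokens: {label}")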
