|
1 |
| -"""Functions to ingest and analyze a codebase directory or single file.""" |
| 1 | +"""Functions to ingest and analyze a codebase directory or single file. |
| 2 | +
|
| 3 | +Memory optimization: |
| 4 | +- Generator-based processing: Uses generators to process files one at a time |
| 5 | +- Streaming approach: Avoids loading all file contents into memory at once |
| 6 | +- Works with lazy loading: Complements the lazy loading in FileSystemNode |
| 7 | +""" |
2 | 8 |
|
3 | 9 | from __future__ import annotations
|
4 | 10 |
|
5 |
| -from typing import TYPE_CHECKING |
| 11 | +import gc |
| 12 | +import io |
| 13 | +from typing import TYPE_CHECKING, Generator |
6 | 14 |
|
7 | 15 | import tiktoken
|
8 | 16 |
|
@@ -47,12 +55,45 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
|
47 | 55 |
|
48 | 56 | tree = "Directory structure:\n" + _create_tree_structure(query, node=node)
|
49 | 57 |
|
50 |
| - content = _gather_file_contents(node) |
| 58 | + # Estimate tokens for tree |
| 59 | + tree_tokens = _count_tokens(tree) |
| 60 | + |
| 61 | + # For token estimation, we need to sample some content |
| 62 | + # We'll use a small sample to estimate without loading everything |
| 63 | + content_sample = "" |
| 64 | + content_generator = _gather_file_contents(node) |
| 65 | + |
| 66 | + # Try to get a small sample for token estimation |
| 67 | + try: |
| 68 | + # Get first item from generator for sampling |
| 69 | + first_item = next(content_generator) |
| 70 | + sample_size = min(len(first_item), 10000) # Limit sample size |
| 71 | + content_sample = first_item[:sample_size] |
| 72 | + except StopIteration: |
| 73 | + # No content |
| 74 | + pass |
| 75 | + |
| 76 | + # Estimate tokens based on sample |
| 77 | + sample_tokens = _count_tokens(content_sample) |
| 78 | + |
| 79 | + # If we have a sample, extrapolate total tokens based on file sizes |
| 80 | + if sample_tokens > 0 and len(content_sample) > 0: |
| 81 | + # Estimate tokens per byte |
| 82 | + tokens_per_byte = sample_tokens / len(content_sample) |
| 83 | + # Estimate total tokens based on total file size |
| 84 | + estimated_content_tokens = int(node.size * tokens_per_byte) |
| 85 | + total_tokens = tree_tokens + estimated_content_tokens |
| 86 | + else: |
| 87 | + total_tokens = tree_tokens |
51 | 88 |
|
52 |
| - token_estimate = _format_token_count(tree + content) |
| 89 | + token_estimate = _format_token_count(total_tokens) |
53 | 90 | if token_estimate:
|
54 | 91 | summary += f"\nEstimated tokens: {token_estimate}"
|
55 | 92 |
|
| 93 | + # For backward compatibility with tests, return content as a string |
| 94 | + # But use a more memory-efficient approach by processing files in chunks |
| 95 | + content = _gather_content_string(node) |
| 96 | + |
56 | 97 | return summary, tree, content
|
57 | 98 |
|
58 | 99 |
|
@@ -93,28 +134,115 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)
|
93 | 134 | return "\n".join(parts) + "\n"
|
94 | 135 |
|
95 | 136 |
|
96 |
def _gather_file_contents(node: FileSystemNode) -> Generator[str]:
    """Yield the contents of every file beneath the given node, one file at a time.

    Walks the tree depth-first and streams each file's content instead of
    concatenating everything into a single string, so at most one file's
    content is resident at any moment.  Complements the lazy loading in
    ``FileSystemNode``.

    Parameters
    ----------
    node : FileSystemNode
        The current directory or file node being processed.

    Yields
    ------
    Generator[str]
        The content of each file as a string.

    """
    if node.type == FileSystemNodeType.DIRECTORY:
        # Recurse depth-first; each child's files are streamed in turn,
        # so no directory ever materialises all of its contents at once.
        for entry in node.children:
            yield from _gather_file_contents(entry)
        return

    yield node.content_string
    # Drop the cached content right after handing it out so memory stays bounded.
    node.clear_content_cache()
| 167 | + |
def _gather_content_string(node: FileSystemNode) -> str:
    """Gather the contents of all files under *node* into one string.

    Streams file contents via :func:`_gather_file_contents` (which clears each
    file's content cache as it goes) and joins them with ``str.join``, which
    consumes the generator one item at a time and allocates the final string
    once.  This is as memory-efficient as manual chunking while guaranteeing a
    single, size-independent output format.

    Fixes over the previous size-tiered implementation:
    - The >100 MB branch wrote a separator *after* every item, so very large
      repositories got a trailing newline the other branches did not produce.
    - Its periodic StringIO "flush" read the whole buffer out and wrote the
      same data back, freeing nothing while copying everything.
    - The 10-100 MB branch repeatedly re-joined accumulated chunks, copying
      already-joined data over and over.

    Parameters
    ----------
    node : FileSystemNode
        The file system node to process.

    Returns
    -------
    str
        The combined content string: file contents separated by newlines,
        with no trailing newline.

    """
    return "\n".join(_gather_file_contents(node))
118 | 246 |
|
119 | 247 |
|
120 | 248 | def _create_tree_structure(
|
@@ -169,25 +297,43 @@ def _create_tree_structure(
|
169 | 297 | return tree_str
|
170 | 298 |
|
171 | 299 |
|
def _count_tokens(text: str) -> int:
    """Count the number of tokens in a text string.

    Parameters
    ----------
    text : str
        The text string for which to count tokens.

    Returns
    -------
    int
        The number of tokens in the text, or 0 if an error occurs.

    """
    try:
        encoder = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
        tokens = encoder.encode(text, disallowed_special=())
    except (ValueError, UnicodeEncodeError) as exc:
        print(exc)
        return 0
    return len(tokens)
| 321 | + |
| 322 | +def _format_token_count(total_tokens: int) -> str | None: |
| 323 | + """Return a human-readable token-count string (e.g. 1.2k, 1.2 M). |
| 324 | +
|
| 325 | + Parameters |
| 326 | + ---------- |
| 327 | + total_tokens : int |
| 328 | + The number of tokens to format. |
| 329 | +
|
| 330 | + Returns |
| 331 | + ------- |
| 332 | + str | None |
| 333 | + The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if total_tokens is 0. |
| 334 | +
|
| 335 | + """ |
| 336 | + if total_tokens == 0: |
191 | 337 | return None
|
192 | 338 |
|
193 | 339 | for threshold, suffix in _TOKEN_THRESHOLDS:
|
|
0 commit comments