Skip to content

Commit a99089a

Browse files
refactor: consistent cloning & pattern-handling (#388)
Co-authored-by: ix-56h <n.guintini@protonmail.com>
1 parent 360a38e commit a99089a

37 files changed

+1022
-865
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
- name: Install dependencies
5252
run: |
5353
python -m pip install --upgrade pip
54-
python -m pip install ".[dev]"
54+
python -m pip install ".[dev,server]"
5555
5656
- name: Run tests
5757
if: ${{ matrix.coverage != true }}

.github/workflows/publish_to_pypi.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Publish to PyPI
22

33
on:
44
release:
5-
types: [created] # Run when you click Publish release
5+
types: [created] # Run when you click "Publish release"
66
workflow_dispatch: # ... or run it manually from the Actions tab
77

88
permissions:
@@ -38,7 +38,7 @@ jobs:
3838
name: dist
3939
path: dist/
4040

41-
# Publish to PyPI (only if dist/ succeeded)
41+
# Publish to PyPI (only if "dist/" succeeded)
4242
pypi-publish:
4343
needs: release-build
4444
runs-on: ubuntu-latest

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,12 @@ repos:
122122
pytest-asyncio,
123123
pytest-mock,
124124
python-dotenv,
125+
'sentry-sdk[fastapi]',
125126
slowapi,
126127
starlette>=0.40.0,
128+
strenum; python_version < '3.11',
127129
tiktoken>=0.7.0,
130+
typing_extensions>= 4.0.0; python_version < '3.10',
128131
uvicorn>=0.11.7,
129132
]
130133

@@ -144,9 +147,12 @@ repos:
144147
pytest-asyncio,
145148
pytest-mock,
146149
python-dotenv,
150+
'sentry-sdk[fastapi]',
147151
slowapi,
148152
starlette>=0.40.0,
153+
strenum; python_version < '3.11',
149154
tiktoken>=0.7.0,
155+
typing_extensions>= 4.0.0; python_version < '3.10',
150156
uvicorn>=0.11.7,
151157
]
152158

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
3232
```bash
3333
python -m venv .venv
3434
source .venv/bin/activate
35-
pip install -e ".[dev]"
35+
pip install -e ".[dev,server]"
3636
pre-commit install
3737
```
3838

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,14 @@ You can install it using `pip`:
6666
pip install gitingest
6767
```
6868

69+
or
70+
71+
```bash
72+
pip install gitingest[server]
73+
```
74+
75+
to include server dependencies for self-hosting.
76+
6977
However, it might be a good idea to use `pipx` to install it.
7078
You can install `pipx` using your preferred package manager.
7179

pyproject.toml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,14 @@ readme = {file = "README.md", content-type = "text/markdown" }
66
requires-python = ">= 3.8"
77
dependencies = [
88
"click>=8.0.0",
9-
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
109
"httpx",
1110
"pathspec>=0.12.1",
1211
"pydantic",
1312
"python-dotenv",
14-
"slowapi",
1513
"starlette>=0.40.0", # Minimum safe release (https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw)
14+
"strenum; python_version < '3.11'",
1615
"tiktoken>=0.7.0", # Support for o200k_base encoding
1716
"typing_extensions>= 4.0.0; python_version < '3.10'",
18-
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
19-
"prometheus-client",
2017
]
2118

2219
license = {file = "LICENSE"}
@@ -46,6 +43,14 @@ dev = [
4643
"pytest-mock",
4744
]
4845

46+
server = [
47+
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
48+
"prometheus-client",
49+
"sentry-sdk[fastapi]",
50+
"slowapi",
51+
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
52+
]
53+
4954
[project.scripts]
5055
gitingest = "gitingest.__main__:main"
5156

src/gitingest/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
"""Gitingest: A package for ingesting data from Git repositories."""
22

3-
from gitingest.clone import clone_repo
43
from gitingest.entrypoint import ingest, ingest_async
5-
from gitingest.ingestion import ingest_query
6-
from gitingest.query_parser import parse_query
74

8-
__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query", "parse_query"]
5+
__all__ = ["ingest", "ingest_async"]

src/gitingest/clone.py

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from gitingest.config import DEFAULT_TIMEOUT
99
from gitingest.utils.git_utils import (
1010
check_repo_exists,
11+
checkout_partial_clone,
1112
create_git_auth_header,
1213
create_git_command,
1314
ensure_git_installed,
1415
is_github_host,
16+
resolve_commit,
1517
run_command,
1618
)
17-
from gitingest.utils.os_utils import ensure_directory
19+
from gitingest.utils.os_utils import ensure_directory_exists_or_create
1820
from gitingest.utils.timeout_wrapper import async_timeout
1921

2022
if TYPE_CHECKING:
@@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4547
# Extract and validate query parameters
4648
url: str = config.url
4749
local_path: str = config.local_path
48-
commit: str | None = config.commit
49-
branch: str | None = config.branch
50-
tag: str | None = config.tag
5150
partial_clone: bool = config.subpath != "/"
5251

53-
# Create parent directory if it doesn't exist
54-
await ensure_directory(Path(local_path).parent)
52+
await ensure_git_installed()
53+
await ensure_directory_exists_or_create(Path(local_path).parent)
5554

56-
# Check if the repository exists
5755
if not await check_repo_exists(url, token=token):
5856
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5957
raise ValueError(msg)
6058

59+
commit = await resolve_commit(config, token=token)
60+
6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

65-
clone_cmd += ["clone", "--single-branch"]
66-
67-
if config.include_submodules:
68-
clone_cmd += ["--recurse-submodules"]
69-
65+
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
7066
if partial_clone:
7167
clone_cmd += ["--filter=blob:none", "--sparse"]
7268

73-
# Shallow clone unless a specific commit is requested
74-
if not commit:
75-
clone_cmd += ["--depth=1"]
76-
77-
# Prefer tag over branch when both are provided
78-
if tag:
79-
clone_cmd += ["--branch", tag]
80-
elif branch and branch.lower() not in ("main", "master"):
81-
clone_cmd += ["--branch", branch]
82-
8369
clone_cmd += [url, local_path]
8470

8571
# Clone the repository
86-
await ensure_git_installed()
8772
await run_command(*clone_cmd)
8873

8974
# Checkout the subpath if it is a partial clone
9075
if partial_clone:
91-
await _checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token=token)
9277

93-
# Checkout the commit if it is provided
94-
if commit:
95-
checkout_cmd = create_git_command(["git"], local_path, url, token)
96-
await run_command(*checkout_cmd, "checkout", commit)
78+
git = create_git_command(["git"], local_path, url, token)
9779

80+
# Ensure the commit is locally available
81+
await run_command(*git, "fetch", "--depth=1", "origin", commit)
9882

99-
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100-
"""Configure sparse-checkout for a partially cloned repository.
83+
# Write the work-tree at that commit
84+
await run_command(*git, "checkout", commit)
10185

102-
Parameters
103-
----------
104-
config : CloneConfig
105-
The configuration for cloning the repository, including subpath and blob flag.
106-
token : str | None
107-
GitHub personal access token (PAT) for accessing private repositories.
108-
109-
"""
110-
subpath = config.subpath.lstrip("/")
111-
if config.blob:
112-
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113-
subpath = str(Path(subpath).parent.as_posix())
114-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
86+
# Update submodules
87+
if config.include_submodules:
88+
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")

src/gitingest/entrypoint.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,30 @@
33
from __future__ import annotations
44

55
import asyncio
6+
import errno
67
import shutil
8+
import stat
79
import sys
810
import warnings
911
from contextlib import asynccontextmanager
1012
from pathlib import Path
11-
from typing import AsyncGenerator
13+
from typing import TYPE_CHECKING, AsyncGenerator, Callable
14+
from urllib.parse import urlparse
1215

1316
from gitingest.clone import clone_repo
1417
from gitingest.config import MAX_FILE_SIZE
1518
from gitingest.ingestion import ingest_query
16-
from gitingest.query_parser import IngestionQuery, parse_query
19+
from gitingest.query_parser import parse_local_dir_path, parse_remote_repo
1720
from gitingest.utils.auth import resolve_token
21+
from gitingest.utils.compat_func import removesuffix
1822
from gitingest.utils.ignore_patterns import load_ignore_patterns
23+
from gitingest.utils.pattern_utils import process_patterns
24+
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS
25+
26+
if TYPE_CHECKING:
27+
from types import TracebackType
28+
29+
from gitingest.schemas import IngestionQuery
1930

2031

2132
async def ingest_async(
@@ -74,13 +85,23 @@ async def ingest_async(
7485
"""
7586
token = resolve_token(token)
7687

77-
query: IngestionQuery = await parse_query(
78-
source=source,
79-
max_file_size=max_file_size,
80-
from_web=False,
88+
source = removesuffix(source.strip(), ".git")
89+
90+
# Determine the parsing method based on the source type
91+
if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
92+
# We either have a full URL or a domain-less slug
93+
query = await parse_remote_repo(source, token=token)
94+
query.include_submodules = include_submodules
95+
_override_branch_and_tag(query, branch=branch, tag=tag)
96+
97+
else:
98+
# Local path scenario
99+
query = parse_local_dir_path(source)
100+
101+
query.max_file_size = max_file_size
102+
query.ignore_patterns, query.include_patterns = process_patterns(
103+
exclude_patterns=exclude_patterns,
81104
include_patterns=include_patterns,
82-
ignore_patterns=exclude_patterns,
83-
token=token,
84105
)
85106

86107
if query.url:
@@ -235,17 +256,49 @@ async def _clone_repo_if_remote(query: IngestionQuery, *, token: str | None) ->
235256
GitHub personal access token (PAT) for accessing private repositories.
236257
237258
"""
259+
kwargs = {}
260+
if sys.version_info >= (3, 12):
261+
kwargs["onexc"] = _handle_remove_readonly
262+
else:
263+
kwargs["onerror"] = _handle_remove_readonly
264+
238265
if query.url:
239266
clone_config = query.extract_clone_config()
240267
await clone_repo(clone_config, token=token)
241268
try:
242269
yield
243270
finally:
244-
shutil.rmtree(query.local_path.parent)
271+
shutil.rmtree(query.local_path.parent, **kwargs)
245272
else:
246273
yield
247274

248275

276+
def _handle_remove_readonly(
277+
func: Callable,
278+
path: str,
279+
exc_info: BaseException | tuple[type[BaseException], BaseException, TracebackType],
280+
) -> None:
281+
"""Handle permission errors raised by ``shutil.rmtree()``.
282+
283+
* Makes the target writable (removes the read-only attribute).
284+
* Retries the original operation (``func``) once.
285+
286+
"""
287+
# 'onerror' passes a (type, value, tb) tuple; 'onexc' passes the exception
288+
if isinstance(exc_info, tuple): # 'onerror' (Python <3.12)
289+
exc: BaseException = exc_info[1]
290+
else: # 'onexc' (Python 3.12+)
291+
exc = exc_info
292+
293+
# Handle only 'Permission denied' and 'Operation not permitted'
294+
if not isinstance(exc, OSError) or exc.errno not in {errno.EACCES, errno.EPERM}:
295+
raise exc
296+
297+
# Make the target writable
298+
Path(path).chmod(stat.S_IWRITE)
299+
func(path)
300+
301+
249302
async def _write_output(tree: str, content: str, target: str | None) -> None:
250303
"""Write combined output to ``target`` (``"-"`` ⇒ stdout).
251304

src/gitingest/ingestion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212

1313
if TYPE_CHECKING:
14-
from gitingest.query_parser import IngestionQuery
14+
from gitingest.schemas import IngestionQuery
1515

1616

1717
def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:

0 commit comments

Comments
 (0)