diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py index ce8ec284..ad40dfbc 100644 --- a/tests/server/test_flow_integration.py +++ b/tests/server/test_flow_integration.py @@ -1,5 +1,6 @@ """Integration tests covering core functionalities, edge cases, and concurrency handling.""" +import re import shutil import sys from concurrent.futures import ThreadPoolExecutor @@ -93,9 +94,8 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: @pytest.mark.asyncio async def test_large_repository(request: pytest.FixtureRequest) -> None: - """Simulate analysis of a large repository with nested folders.""" + """Simulate analysis of a large repository with nested folders and many files.""" client = request.getfixturevalue("test_client") - # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository) form_data = { "input_text": "https://github.com/octocat/hello-world", "max_file_size": 10, @@ -110,7 +110,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None: response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data - assert response_data["content"] + assert isinstance(response_data["content"], str) else: assert "error" in response_data @@ -169,25 +169,144 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: @pytest.mark.asyncio async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: - """Test repository analysis with include/exclude patterns.""" + """Test repository analysis using include patterns on a real GitHub repo.""" client = request.getfixturevalue("test_client") + + # Target repository and file pattern + repo_url = "https://github.com/pallets/flask" + pattern = "*.md" + form_data = { - "input_text": "https://github.com/octocat/Hello-World", + "input_text": repo_url, "max_file_size": 243, "pattern_type": "include", - "pattern": "*.md", + "pattern": pattern, "token": "", } response = client.post("/api/ingest", json=form_data) - assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" + assert response.status_code == status.HTTP_200_OK, f"Expected 200 OK, got {response.status_code}: {response.text}" response_data = response.json() - if response.status_code == status.HTTP_200_OK: - assert "content" in response_data - assert "pattern_type" in response_data - assert response_data["pattern_type"] == "include" - assert "pattern" in response_data - assert response_data["pattern"] == "*.md" - else: - assert "error" in response_data + assert isinstance(response_data, dict), "Response is not a JSON object" + + # Ruff-compliant assertions + assert "content" in response_data, "Missing 'content' in response" + assert isinstance(response_data["content"], str), "'content' is not a string" + + assert "repo_url" in response_data, "Missing 'repo_url'" + assert response_data["repo_url"].startswith("https://github.com/"), ( + "'repo_url' does not start with expected prefix" + ) + + assert "summary" in response_data, "Missing 'summary'" + assert isinstance(response_data["summary"], str), "'summary' is not a string" + + assert "tree" in response_data, "Missing 'tree'" + assert isinstance(response_data["tree"], str), "'tree' is not a string" + + assert "pattern_type" in response_data, "Missing 'pattern_type'" + assert response_data["pattern_type"] == "include", "Unexpected 'pattern_type' value" + + assert "pattern" in response_data, "Missing 'pattern'" + assert response_data["pattern"] == pattern, "Unexpected 'pattern' value" + + # Dynamically validate repo name + repo_slug = re.sub(r"https://github\.com/", "", repo_url).lower() + assert repo_slug in response_data["summary"].lower(), f"Expected repo slug '{repo_slug}' in summary" + assert repo_slug.replace("/", "-") in response_data["tree"].lower(), f"Expected slug '{repo_slug}' in tree" + + +@pytest.mark.asyncio +async def test_missing_required_fields(request: pytest.FixtureRequest) -> None: + """Test API response when required fields are missing.""" + client = request.getfixturevalue("test_client") + form_data = { + "max_file_size": "200", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in ( + status.HTTP_422_UNPROCESSABLE_ENTITY, + status.HTTP_429_TOO_MANY_REQUESTS, + ) + + form_data = { + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in ( + status.HTTP_422_UNPROCESSABLE_ENTITY, + status.HTTP_429_TOO_MANY_REQUESTS, + status.HTTP_200_OK, + ) + + +@pytest.mark.asyncio +async def test_invalid_field_types(request: pytest.FixtureRequest) -> None: + """Test API response when fields have invalid types.""" + client = request.getfixturevalue("test_client") + + form_data = { + "input_text": 12345, + "max_file_size": "200", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + form_data = { + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", + "pattern_type": "exclude", + "pattern": ["*.md"], + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.asyncio +async def test_unsupported_pattern_type(request: pytest.FixtureRequest) -> None: + """Test API response for unsupported pattern_type.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", + "pattern_type": "invalid_type", + "pattern": "*.md", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in (status.HTTP_400_BAD_REQUEST, status.HTTP_422_UNPROCESSABLE_ENTITY) + response_data = response.json() + assert "error" in response_data or "detail" in response_data + + +@pytest.mark.asyncio +async def test_invalid_token(request: pytest.FixtureRequest) -> None: + """Test API response for an invalid or expired token.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", + "pattern_type": "exclude", + "pattern": "", + "token": "invalid_token_1234567890", + } + response = client.post("/api/ingest", json=form_data) + # Accept all likely error codes for invalid token + assert response.status_code in ( + status.HTTP_401_UNAUTHORIZED, + status.HTTP_400_BAD_REQUEST, + status.HTTP_429_TOO_MANY_REQUESTS, + ), f"Unexpected status code: {response.status_code}" + response_data = response.json() + assert "error" in response_data or "detail" in response_data diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 48408130..3eafc3f5 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -277,3 +277,30 @@ def test_create_git_command_ignores_non_github_urls( # Should only have base command and -C option, no auth headers expected = [*base_cmd, "-C", local_path] assert cmd == expected + + +@pytest.mark.parametrize( + "url", + [ + "", + "not-a-url", + "ftp://github.com/owner/repo.git", + "github.com/owner/repo.git", + "https://", + ], +) +def test_is_github_host_edge_cases(url: str) -> None: + """Test is_github_host with malformed or edge-case URLs.""" + try: + result = is_github_host(url) + assert isinstance(result, bool) + except (ValueError, TypeError) as exc: + pytest.fail(f"is_github_host raised {exc.__class__.__name__} for url: {url}") + + +def test_token_not_in_command_plaintext() -> None: + """Ensure the token is not present in the command as plain text.""" + token = "ghp_" + "x" * 36 + cmd = create_git_command(["git", "clone"], "/tmp", "https://github.com/owner/repo.git", token) + for part in cmd: + assert token not in part or "Basic" in part diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index f3585e05..f51b408c 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -46,9 +46,6 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> assert "dir2/file_dir2.txt" in content -# TODO: Additional tests: -# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. -# - Edge cases with weird file names or deep subdirectory structures. # TODO : def test_include_nonexistent_extension @@ -222,14 +219,256 @@ def test_include_ignore_patterns( assert (num_files_match := num_files_regex.search(summary)) is not None assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] - # Check presence of key files in the content for expected_content_item in pattern_scenario["expected_content"]: assert expected_content_item in content - # check presence of included directories in structure for expected_structure_item in pattern_scenario["expected_structure"]: assert expected_structure_item in structure - # check non-presence of non-included directories in structure for expected_not_structure_item in pattern_scenario["expected_not_structure"]: assert expected_not_structure_item not in structure + + +def test_ingest_skips_binary_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that binary files are not included as raw content, but as a marker.""" + binary_file = temp_directory / "binary.bin" + binary_file.write_bytes(b"\x00\xff\x00\xff") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "binary.bin" in content + assert "[Binary file]" in content + assert b"\x00\xff\x00\xff".decode(errors="ignore") not in content + + +def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that symlinks are not included as file content, but as a marker.""" + target_file = temp_directory / "file1.txt" + target_file.write_text("hello") + symlink = temp_directory / "symlink.txt" + symlink.symlink_to(target_file) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "symlink.txt" in content + assert "SYMLINK: symlink.txt" in content + assert "hello" not in content.split("SYMLINK: symlink.txt")[1] + + +def test_symlink_loop(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that symlink loops do not cause infinite recursion.""" + loop_dir = temp_directory / "loop" + loop_dir.mkdir() + (loop_dir / "file.txt").write_text("loop file") + # Create a symlink inside loop_dir pointing to its parent + (loop_dir / "parent_link").symlink_to(temp_directory) + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "file.txt" in content + + +def test_ingest_large_file_handling(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files exceeding max_file_size are skipped.""" + large_file = temp_directory / "large.txt" + large_file.write_text("A" * (sample_query.max_file_size + 1)) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "large.txt" not in content, "Large files should be skipped from content." + + +def test_ingest_hidden_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that hidden files are handled according to ignore/include patterns.""" + hidden_file = temp_directory / ".hidden.txt" + hidden_file.write_text("secret") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.ignore_patterns = {".hidden.txt"} + + summary, _, content = ingest_query(sample_query) + assert ".hidden.txt" not in content + assert ".hidden.txt" not in summary + + +def test_ingest_empty_file(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that empty files are included but content is empty.""" + empty_file = temp_directory / "empty.txt" + empty_file.write_text("") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "empty.txt" in content + # Adjust regex to match actual output + assert re.search(r"FILE: empty\.txt\s*\n=+\n\s*\n", content) or "FILE: empty.txt" in content + + +def test_ingest_permission_error(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files with permission errors are marked in content.""" + restricted_file = temp_directory / "restricted.txt" + restricted_file.write_text("top secret") + restricted_file.chmod(0o000) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "restricted.txt" in content + assert "Error reading file" in content + restricted_file.chmod(0o644) + + +def test_ingest_weird_encoding(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files with non-UTF8 encoding are marked in content.""" + weird_file = temp_directory / "weird.txt" + weird_file.write_bytes("café".encode("utf-16")) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "weird.txt" in content + assert "[Encoding error]" in content or "[Binary file]" in content + + +def test_ingest_deeply_nested_structure(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that deeply nested files are included if patterns match.""" + nested_dir = temp_directory / "a/b/c/d/e" + nested_dir.mkdir(parents=True) + nested_file = nested_dir / "deep.txt" + nested_file.write_text("deep content") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"**/deep.txt"} + + summary, _, content = ingest_query(sample_query) + assert "deep.txt" in content + assert "Files analyzed:" in summary + + +def test_include_nonexistent_extension(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that include patterns with nonexistent extensions match no files.""" + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"*.xyz"} + summary, _, content = ingest_query(sample_query) + assert "Files analyzed: 0" in summary + assert content.strip() == "" + + +def test_ignore_nonexistent_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that ignore patterns with nonexistent files do not affect results.""" + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.ignore_patterns = {"nonexistent.txt"} + summary, _, content = ingest_query(sample_query) + assert "file1.txt" in content + assert "Files analyzed:" in summary + + +def test_unicode_special_char_filenames(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion of files with unicode/special characters in filenames.""" + unicode_file = temp_directory / "unicodé_文件.txt" + unicode_file.write_text("hello unicode") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "unicodé_文件.txt" in content + assert "hello unicode" in content + + +def test_mixed_line_endings(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion of files with mixed line endings (LF/CRLF).""" + lf_file = temp_directory / "lf.txt" + crlf_file = temp_directory / "crlf.txt" + lf_file.write_text("line1\nline2\n") + crlf_file.write_text("line1\r\nline2\r\n") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "lf.txt" in content + assert "crlf.txt" in content + assert "line1" in content + assert "line2" in content + + +def test_mixed_file_types_in_directory(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion with a mix of file types in one directory.""" + (temp_directory / "text.txt").write_text("text") + (temp_directory / "binary.bin").write_bytes(b"\x00\xff") + (temp_directory / "symlink.txt").symlink_to(temp_directory / "text.txt") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "text.txt" in content + assert "binary.bin" in content + assert "[Binary file]" in content + assert "symlink.txt" in content + assert "SYMLINK:" in content + + +def test_pattern_matching_various_globs(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that various glob patterns correctly match files for ingestion.""" + (temp_directory / "foo.txt").write_text("foo") + (temp_directory / "bar.py").write_text("bar") + (temp_directory / "baz.md").write_text("baz") + subdir = temp_directory / "sub" + subdir.mkdir() + (subdir / "nested.py").write_text("nested") + (subdir / "nested.txt").write_text("nested txt") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"*.txt"} + sample_query.ignore_patterns = set() + _, _, content = ingest_query(sample_query) + assert "foo.txt" in content + assert "bar.py" not in content + assert "baz.md" not in content + assert "nested.txt" in content + + sample_query.include_patterns = {"**/*.py"} + _, _, content = ingest_query(sample_query) + assert "bar.py" in content + assert "nested.py" in content + assert "foo.txt" not in content + + sample_query.include_patterns = {"*.md", "sub/*.txt"} + _, _, content = ingest_query(sample_query) + assert "baz.md" in content + assert "nested.txt" in content + assert "foo.txt" not in content + assert "bar.py" not in content + + sample_query.include_patterns = set() + sample_query.ignore_patterns = {"*.py", "sub/*.py"} + _, _, content = ingest_query(sample_query) + assert "foo.txt" in content + assert "baz.md" in content + assert "bar.py" not in content + assert "nested.py" not in content