Skip to content

feat: improve unit tests quality and coverage #433

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
222 changes: 216 additions & 6 deletions tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,6 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) ->
assert "dir2/file_dir2.txt" in content


# TODO: Additional tests:
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
# - Edge cases with weird file names or deep subdirectory structures.
# TODO : def test_include_nonexistent_extension


Expand Down Expand Up @@ -222,14 +219,227 @@ def test_include_ignore_patterns(
assert (num_files_match := num_files_regex.search(summary)) is not None
assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"]

# Check presence of key files in the content
for expected_content_item in pattern_scenario["expected_content"]:
assert expected_content_item in content

# check presence of included directories in structure
for expected_structure_item in pattern_scenario["expected_structure"]:
assert expected_structure_item in structure

# check non-presence of non-included directories in structure
for expected_not_structure_item in pattern_scenario["expected_not_structure"]:
assert expected_not_structure_item not in structure


def test_ingest_skips_binary_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that binary files are not included as raw content, but as a marker."""
binary_file = temp_directory / "binary.bin"
binary_file.write_bytes(b"\x00\xff\x00\xff")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "binary.bin" in content
assert "[Binary file]" in content
assert b"\x00\xff\x00\xff".decode(errors="ignore") not in content


def test_ingest_binary_file_summary(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Ensure binary files are counted and marked in content."""
binary_file = temp_directory / "binary.bin"
binary_file.write_bytes(b"\x00\xff\x00\xff")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
summary, _, content = ingest_query(sample_query)
assert "binary.bin" in content
assert "[Binary file]" in content
assert "Files analyzed:" in summary


def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that symlinks are not included as file content, but as a marker."""
target_file = temp_directory / "file1.txt"
target_file.write_text("hello")
symlink = temp_directory / "symlink.txt"
symlink.symlink_to(target_file)

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "symlink.txt" in content
assert "SYMLINK: symlink.txt" in content
assert "hello" not in content.split("SYMLINK: symlink.txt")[1]


def test_ingest_symlink_summary(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Ensure symlinks are marked in content."""
target_file = temp_directory / "file1.txt"
target_file.write_text("hello")
symlink = temp_directory / "symlink.txt"
symlink.symlink_to(target_file)
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
summary, _, content = ingest_query(sample_query)
assert "symlink.txt" in content
assert "SYMLINK: symlink.txt" in content
assert "Files analyzed:" in summary


def test_ingest_large_file_handling(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files exceeding max_file_size are skipped."""
large_file = temp_directory / "large.txt"
large_file.write_text("A" * (sample_query.max_file_size + 1))

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "large.txt" not in content, "Large files should be skipped from content."


def test_ingest_hidden_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that hidden files are handled according to ignore/include patterns."""
hidden_file = temp_directory / ".hidden.txt"
hidden_file.write_text("secret")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.ignore_patterns = {".hidden.txt"}

summary, _, content = ingest_query(sample_query)
assert ".hidden.txt" not in content
assert ".hidden.txt" not in summary


def test_ingest_empty_file(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that empty files are included but content is empty."""
empty_file = temp_directory / "empty.txt"
empty_file.write_text("")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "empty.txt" in content
# Adjust regex to match actual output
assert re.search(r"FILE: empty\.txt\s*\n=+\n\s*\n", content) or "FILE: empty.txt" in content


def test_ingest_permission_error(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files with permission errors are marked in content."""
restricted_file = temp_directory / "restricted.txt"
restricted_file.write_text("top secret")
restricted_file.chmod(0o000)

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "restricted.txt" in content
assert "Error reading file" in content
restricted_file.chmod(0o644)


def test_ingest_weird_encoding(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that files with non-UTF8 encoding are marked in content."""
weird_file = temp_directory / "weird.txt"
weird_file.write_bytes("café".encode("utf-16"))

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None

_, _, content = ingest_query(sample_query)
assert "weird.txt" in content
assert "[Encoding error]" in content or "[Binary file]" in content


def test_ingest_deeply_nested_structure(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that deeply nested files are included if patterns match."""
nested_dir = temp_directory / "a/b/c/d/e"
nested_dir.mkdir(parents=True)
nested_file = nested_dir / "deep.txt"
nested_file.write_text("deep content")

sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.include_patterns = {"**/deep.txt"}

summary, _, content = ingest_query(sample_query)
assert "deep.txt" in content
assert "Files analyzed:" in summary


def test_include_nonexistent_extension(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that include patterns with nonexistent extensions match no files."""
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.include_patterns = {"*.xyz"}
summary, _, content = ingest_query(sample_query)
assert "Files analyzed: 0" in summary
assert content.strip() == ""


def test_ignore_nonexistent_files(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test that ignore patterns with nonexistent files do not affect results."""
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
sample_query.ignore_patterns = {"nonexistent.txt"}
summary, _, content = ingest_query(sample_query)
assert "file1.txt" in content
assert "Files analyzed:" in summary


def test_unicode_special_char_filenames(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion of files with unicode/special characters in filenames."""
unicode_file = temp_directory / "unicodé_文件.txt"
unicode_file.write_text("hello unicode")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "unicodé_文件.txt" in content
assert "hello unicode" in content


def test_mixed_line_endings(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion of files with mixed line endings (LF/CRLF)."""
lf_file = temp_directory / "lf.txt"
crlf_file = temp_directory / "crlf.txt"
lf_file.write_text("line1\nline2\n")
crlf_file.write_text("line1\r\nline2\r\n")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "lf.txt" in content
assert "crlf.txt" in content
assert "line1" in content
assert "line2" in content


def test_mixed_file_types_in_directory(temp_directory: Path, sample_query: IngestionQuery) -> None:
"""Test ingestion with a mix of file types in one directory."""
(temp_directory / "text.txt").write_text("text")
(temp_directory / "binary.bin").write_bytes(b"\x00\xff")
(temp_directory / "symlink.txt").symlink_to(temp_directory / "text.txt")
sample_query.local_path = temp_directory
sample_query.subpath = "/"
sample_query.type = None
_, _, content = ingest_query(sample_query)
assert "text.txt" in content
assert "binary.bin" in content
assert "[Binary file]" in content
assert "symlink.txt" in content
assert "SYMLINK:" in content