From 22e7c3488344aabbbf7cd9f5d8302f1906cb9685 Mon Sep 17 00:00:00 2001 From: ix-56h Date: Fri, 4 Jul 2025 00:34:12 +0200 Subject: [PATCH 1/2] fix binary file check --- src/gitingest/schemas/filesystem.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 2fbe56d1..05300fa7 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -143,7 +143,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if chunk == b"": return "[Empty file]" - if not _decodes(chunk, "utf-8"): + if is_binary_file(chunk): return "[Binary file]" # Find the first encoding that decodes the sample @@ -160,3 +160,15 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return fp.read() except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" + + +def is_binary_file(file_contents: bytes | None) -> bool: + """Return True if ``file_contents`` holds any byte outside the text set (BEL, BS, TAB, LF, FF, CR, ESC, 0x20-0xFF except DEL); None or empty input is not binary. NUL counts as binary, so UTF-16-encoded ASCII text is flagged too.""" + if not file_contents: + return False # Empty files are not binary + + text_characters = bytes( + {7, 8, 9, 10, 12, 13, 27}.union(set(range(0x20, 0x100)) - {0x7F}), + ) + # If translate returns any bytes, those are non-text (binary) bytes + return bool(file_contents.translate(None, text_characters)) From b62b1eadb57ce3c2326ae5fe9210b8f6354367bb Mon Sep 17 00:00:00 2001 From: ix-56h Date: Fri, 4 Jul 2025 01:29:42 +0200 Subject: [PATCH 2/2] handle specific utf-16 encoding for windows --- src/gitingest/utils/file_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py index 2c6ef74d..a1b9a0e2 100644 --- a/src/gitingest/utils/file_utils.py +++ b/src/gitingest/utils/file_utils.py @@ -27,9 +27,11 @@ def _get_preferred_encodings() -> list[str]: platform's default encoding followed 
by common fallback encodings. """ - encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] + encodings = [locale.getpreferredencoding(), "utf-8", "utf-16le", "utf-8-sig", "latin"] if platform.system() == "Windows": - encodings += ["cp1252", "iso-8859-1"] + encodings += ["utf-16be", "cp1252", "iso-8859-1"] + else: + encodings += ["utf-16"] return list(dict.fromkeys(encodings))