From 22e7c3488344aabbbf7cd9f5d8302f1906cb9685 Mon Sep 17 00:00:00 2001
From: ix-56h <n.guintini@protonmail.com>
Date: Fri, 4 Jul 2025 00:34:12 +0200
Subject: [PATCH 1/2] fix binary file check

---
 src/gitingest/schemas/filesystem.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py
index 2fbe56d1..05300fa7 100644
--- a/src/gitingest/schemas/filesystem.py
+++ b/src/gitingest/schemas/filesystem.py
@@ -143,7 +143,7 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
         if chunk == b"":
             return "[Empty file]"
 
-        if not _decodes(chunk, "utf-8"):
+        if is_binary_file(chunk):
             return "[Binary file]"
 
         # Find the first encoding that decodes the sample
@@ -160,3 +160,15 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
                 return fp.read()
         except (OSError, UnicodeDecodeError) as exc:
             return f"Error reading file with {good_enc!r}: {exc}"
+
+
+def is_binary_file(file_contents: bytes | None) -> bool:
+    """Return True if the given bytes include any byte outside the allowed text set (e.g. NUL or DEL)."""
+    if not file_contents:
+        return False  # None and b"" are both reported as not binary
+
+    text_characters = bytes(  # BEL, BS, TAB, LF, FF, CR, ESC, plus 0x20-0xFF except DEL (0x7F)
+        {7, 8, 9, 10, 12, 13, 27}.union(set(range(0x20, 0x100)) - {0x7F}),
+    )
+    # translate(None, delete) strips every text byte; any byte left over is non-text
+    return bool(file_contents.translate(None, text_characters))

From b62b1eadb57ce3c2326ae5fe9210b8f6354367bb Mon Sep 17 00:00:00 2001
From: ix-56h <n.guintini@protonmail.com>
Date: Fri, 4 Jul 2025 01:29:42 +0200
Subject: [PATCH 2/2] handle specific utf-16 encoding for windows

---
 src/gitingest/utils/file_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py
index 2c6ef74d..a1b9a0e2 100644
--- a/src/gitingest/utils/file_utils.py
+++ b/src/gitingest/utils/file_utils.py
@@ -27,9 +27,11 @@ def _get_preferred_encodings() -> list[str]:
         platform's default encoding followed by common fallback encodings.
 
     """
-    encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"]
+    encodings = [locale.getpreferredencoding(), "utf-8", "utf-16le", "utf-8-sig", "latin"]
     if platform.system() == "Windows":
-        encodings += ["cp1252", "iso-8859-1"]
+        encodings += ["utf-16be", "cp1252", "iso-8859-1"]
+    else:
+        encodings += ["utf-16"]
     return list(dict.fromkeys(encodings))