Feature/zip support #29

Open · wants to merge 9 commits into main
34 changes: 31 additions & 3 deletions README.md
@@ -117,6 +117,32 @@ cat examples/checksums.csv
> examples/example_content/dir/.hidden_dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df
>```

- **ZIP Support:**
sum-buddy supports processing ZIP files. When a ZIP file is encountered, it will:
- Calculate the checksum of the ZIP file itself.
- List each file inside the ZIP as `zipfile.zip/filename` with its own checksum, using in-memory streaming (no extraction to disk).

Example:
```bash
sum-buddy --output-file examples/checksums_zip.csv examples/example_content/
```
> Output:
> ```console
> Calculating md5 checksums on examples/example_content/: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15109.16it/s]
> md5 checksums for examples/example_content/ written to examples/checksums_zip.csv
> ```
```bash
cat examples/checksums_zip.csv
```
> Output:
> ```console
> filepath,filename,md5
> examples/example_content/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df
> examples/example_content/testzip.zip,testzip.zip,dcf68ba27f40590ff899b63d44e18836
> examples/example_content/testzip.zip/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df
> examples/example_content/testzip.zip/dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df
> examples/example_content/dir/file.txt,file.txt,7d52c7437e9af58dac029dd11b1024df
> ```

If only a target directory is passed, the default settings are to ignore hidden files and directories (those that begin with a `.`), use the `md5` algorithm, and print output to `stdout`, which can be piped (`|`).
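For example, a minimal pipe (illustrative; it reuses the sample paths from the ZIP example above, and the progress bar is suppressed when printing to `stdout`):
```bash
# checksums go to stdout as CSV; filter for entries inside the sample ZIP
sum-buddy examples/example_content/ | grep "testzip.zip/"
```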

@@ -172,9 +198,11 @@ pip install -e ".[dev]"
3. Install pre-commit hook
```bash
pre-commit install
pre-commit autoupdate # optionally update
```
4. Run tests:

### Tests

To run all tests:
```bash
pytest
python -m pytest
```
33 changes: 25 additions & 8 deletions src/sumbuddy/__main__.py
@@ -7,6 +7,7 @@
from tqdm import tqdm
import sys
import os
from sumbuddy.archive import ArchiveHandler

def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hidden=False, algorithm='md5', length=None):
"""
@@ -24,21 +25,23 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi
mapper = Mapper()

if os.path.isfile(input_path):
file_paths = [input_path]
regular_files = [input_path]
zip_archives = []
if ignore_file:
print("Warning: --ignore-file (-i) flag is ignored when input is a single file.")
if include_hidden:
print("Warning: --include-hidden (-H) flag is ignored when input is a single file.")
else:
try:
file_paths = mapper.gather_file_paths(input_path, ignore_file=ignore_file, include_hidden=include_hidden)
regular_files, zip_archives = mapper.gather_file_paths(input_path, ignore_file=ignore_file, include_hidden=include_hidden)
except (EmptyInputDirectoryError, NoFilesAfterFilteringError) as e:
sys.exit(str(e))

# Exclude the output file from being hashed
if output_filepath:
output_file_abs_path = os.path.abspath(output_filepath)
file_paths = [path for path in file_paths if os.path.abspath(path) != output_file_abs_path]
regular_files = [path for path in regular_files if os.path.abspath(path) != output_file_abs_path]
zip_archives = [path for path in zip_archives if os.path.abspath(path) != output_file_abs_path]

hasher = Hasher(algorithm)
output_stream = open(output_filepath, 'w', newline='') if output_filepath else sys.stdout
@@ -48,10 +51,25 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi
writer.writerow(["filepath", "filename", f"{algorithm}"])

disable_tqdm = output_filepath is None
for file_path in tqdm(file_paths, desc=f"Calculating {algorithm} checksums on {input_path}", disable=disable_tqdm):
checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length)
writer.writerow([file_path, os.path.basename(file_path), checksum])

total_files = len(regular_files) + sum(1 for z in zip_archives for _ in ArchiveHandler.stream_zip(z)) + len(zip_archives)
Copilot AI commented on Jul 21, 2025:

The total_files calculation iterates through all ZIP archives twice: once to count files for the progress bar and again during processing. This is inefficient for large ZIP files. Consider caching the file counts or restructuring the progress tracking.

Suggested change:
- total_files = len(regular_files) + sum(1 for z in zip_archives for _ in ArchiveHandler.stream_zip(z)) + len(zip_archives)
+ total_files = len(regular_files) + len(zip_archives)  # Start with regular files and ZIP archives themselves

Collaborator replied:

Non-blocking for now. Can add a new issue to use file content headers and perhaps use more generic "Archive" language in __main__.py rather than referring to zip once a new archive file type is supported.
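A possible restructuring, sketched under the assumption that counting via the ZIP central directory is acceptable; `count_total_files` is a hypothetical helper, not code from this PR:
```python
import zipfile

def count_total_files(regular_files, zip_archives):
    # One CSV row per regular file, plus one row for each archive itself
    total = len(regular_files) + len(zip_archives)
    for zip_path in zip_archives:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            # namelist() reads only the central directory; no member data is
            # decompressed, so each archive is read in full just once, during hashing
            total += sum(1 for name in zip_ref.namelist() if not name.endswith("/"))
    return total
```
This keeps the per-member progress updates while avoiding the second full pass that `stream_zip` currently performs just for counting.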

with tqdm(total=total_files, desc=f"Calculating {algorithm} checksums on {input_path}", disable=disable_tqdm) as pbar:
# Process regular files
for file_path in regular_files:
checksum = hasher.checksum_file(file_path, algorithm=algorithm, length=length)
writer.writerow([file_path, os.path.basename(file_path), checksum])
pbar.update(1)
# Process zip archives
for zip_path in zip_archives:
# Write checksum for the zip file itself
checksum = hasher.checksum_file(zip_path, algorithm=algorithm, length=length)
writer.writerow([zip_path, os.path.basename(zip_path), checksum])
pbar.update(1)
# Write checksums for each file inside the zip
for member, file_obj in ArchiveHandler.stream_zip(zip_path):
virtual_path = f"{zip_path}/{member}"
checksum = hasher.checksum_file(file_obj, algorithm=algorithm, length=length)
writer.writerow([virtual_path, os.path.basename(member), checksum])
pbar.update(1)
finally:
if output_filepath:
output_stream.close()
@@ -60,7 +78,6 @@ def get_checksums(input_path, output_filepath=None, ignore_file=None, include_hi
print(f"{algorithm} checksums for {input_path} written to {output_filepath}")

def main():

available_algorithms = ', '.join(hashlib.algorithms_available)

parser = argparse.ArgumentParser(description="Generate CSV with filepath, filename, and checksums for all files in a given directory (or a single file)")
65 changes: 65 additions & 0 deletions src/sumbuddy/archive.py
@@ -0,0 +1,65 @@
import os
import zipfile
import tempfile
import shutil

class ArchiveHandler:
def __init__(self):
self.temp_dir = None

def process_zip(self, zip_path, root_dir):
Copilot AI commented on Jul 21, 2025:

The process_zip method creates temporary directories but the cleanup responsibility is unclear. Consider using a context manager or ensuring cleanup is called consistently to prevent temporary directory accumulation.

Collaborator replied:

Actually, with stream_zip in place, process_zip can be removed.
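If process_zip were kept rather than removed, a minimal context-manager sketch could tie cleanup to scope; `extracted_zip` is a hypothetical name, not part of this PR:
```python
import os
import tempfile
import zipfile
from contextlib import contextmanager

@contextmanager
def extracted_zip(zip_path):
    """Yield (extracted_path, virtual_path) pairs for files in the archive."""
    # TemporaryDirectory removes itself when the with-block exits, even on exceptions
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)
            members = [m for m in zip_ref.namelist() if not m.endswith("/")]
        yield [(os.path.join(temp_dir, m), f"{zip_path}/{m}") for m in members]
```
Callers would hash inside `with extracted_zip(path) as pairs:` so the extracted files exist only for the duration of the block.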

"""
Process a zip file and return paths to its contents.

Parameters:
------------
zip_path - String. Path to the zip file.
root_dir - String. Root directory for relative path calculations.

Returns:
---------
List of tuples (file_path, relative_path) for files in the zip.
"""
if not zipfile.is_zipfile(zip_path):
return []

# Create a temporary directory for extraction
self.temp_dir = tempfile.mkdtemp()

try:
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
# Extract all contents to temp directory
zip_ref.extractall(self.temp_dir)
Comment on lines +27 to +32

Copilot AI commented on Jun 18, 2025:

The ArchiveHandler stores the temporary directory in an instance variable, which may be overwritten if process_zip is called multiple times. Consider creating a local temporary directory for each call or managing multiple temp directories to ensure that all extracted files are properly cleaned up.

Suggested change:
- self.temp_dir = tempfile.mkdtemp()
- try:
-     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-         # Extract all contents to temp directory
-         zip_ref.extractall(self.temp_dir)
+ temp_dir = tempfile.mkdtemp()
+ try:
+     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+         # Extract all contents to temp directory
+         zip_ref.extractall(temp_dir)


# Get list of all files in the zip
file_paths = []
for member in zip_ref.namelist():
# Only add files, not directories
if member.endswith('/'):
continue
full_path = os.path.join(self.temp_dir, member)
# The path as it should appear in the CSV: zip_path/member
rel_path = f"{zip_path}/{member}"
file_paths.append((full_path, rel_path))
return file_paths
except Exception as e:
self.cleanup()
raise e

def cleanup(self):
"""Clean up temporary directory if it exists."""
if self.temp_dir and os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir)
self.temp_dir = None

@staticmethod
def stream_zip(zip_path):
"""
Yield (name, file-like object) for each file in the ZIP archive.
Only yields regular files (not directories).
"""
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
for member in zip_ref.namelist():
if member.endswith('/'):
continue # skip directories
yield member, zip_ref.open(member)
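For context, a minimal usage sketch pairing stream_zip with the updated Hasher (the archive path is the README's sample; this block is illustrative, not part of the diff):
```python
from sumbuddy.archive import ArchiveHandler
from sumbuddy.hasher import Hasher

hasher = Hasher("md5")
zip_path = "examples/example_content/testzip.zip"
# stream_zip yields (member_name, file-like object) without extracting to disk
for member, file_obj in ArchiveHandler.stream_zip(zip_path):
    checksum = hasher.checksum_file(file_obj)  # file-like objects are accepted per this PR
    print(f"{zip_path}/{member},{checksum}")
```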
15 changes: 10 additions & 5 deletions src/sumbuddy/hasher.py
@@ -5,13 +5,13 @@ class Hasher:
def __init__(self, algorithm='md5'):
self.algorithm = algorithm

def checksum_file(self, file_path, algorithm=None, length=None):
def checksum_file(self, file_path_or_obj, algorithm=None, length=None):
"""
Calculate the checksum of a file using the specified algorithm.

Parameters:
------------
file_path - String. Path to file to apply checksum function.
file_path_or_obj - String or file-like object. Path to file or file-like object to apply checksum function.
algorithm - String. Hash function to use for checksums. Default: 'md5', see options with 'hashlib.algorithms_available'.
length - Integer [optional]. Length of the digest for SHAKE and BLAKE algorithms in bytes.

@@ -55,9 +55,14 @@ def checksum_file(self, file_path, algorithm=None, length=None):
raise LengthUsedForFixedLengthHashError(algorithm)
hash_func = hashlib.new(algorithm)

# Read the file and update the hash function
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
# Handle both file paths and file-like objects
if isinstance(file_path_or_obj, str):
with open(file_path_or_obj, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_func.update(chunk)
else:
# Assume it's a file-like object
Copilot AI commented on Jul 21, 2025:

The file-like object handling should include error handling for cases where the object doesn't support the read() method. Consider adding a try-except block or type checking to provide a clearer error message.

Suggested change:
  # Assume it's a file-like object
+ # Validate that the object supports the 'read' method
+ if not hasattr(file_path_or_obj, 'read'):
+     raise TypeError("The provided object is not a valid file-like object. It must support the 'read()' method.")

Collaborator replied:

I'm having a hard time concocting a scenario where not hasattr(file_path_or_obj, 'read') would be true. The gather_file_paths and ArchiveHandler.stream_zip() should always return something that can be read. The only thing I can think of is if someone tries to manually do something like checksum_file(321), which would already give a pretty clear error of AttributeError: 'int' object has no attribute 'read'. I think we can leave this one alone for now.

for chunk in iter(lambda: file_path_or_obj.read(4096), b""):
hash_func.update(chunk)

# Return the hash digest
26 changes: 12 additions & 14 deletions src/sumbuddy/mapper.py
@@ -1,10 +1,13 @@
import os
import zipfile
from sumbuddy.filter import Filter
from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError, NotADirectoryError
from sumbuddy.archive import ArchiveHandler

class Mapper:
def __init__(self):
self.filter_manager = Filter()
self.archive_handler = ArchiveHandler()

def reset_filter(self, ignore_file=None, include_hidden=False):
"""
@@ -28,24 +31,16 @@ def reset_filter(self, ignore_file=None, include_hidden=False):
def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=False):
"""
Generate list of file paths in the input directory based on ignore pattern rules.

Parameters:
------------
input_directory - String. Directory to traverse for files.
ignore_file - String [optional]. Filepath for the ignore patterns file.
include_hidden - Boolean [optional]. Whether to include hidden files.

Returns:
---------
file_paths - List. Files in input_directory that are not ignored.
Returns a tuple: (regular_files, zip_archives)
Comment on lines 33 to +34

Copilot AI commented on Jul 21, 2025:

The docstring should be updated to include the complete parameter and return documentation that was removed. The current docstring is incomplete and missing parameter descriptions and detailed return value information.

Collaborator replied:

Yes, let's put the inputs back in the format they were in previously.
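A sketch of the restored docstring, reusing the removed parameter descriptions verbatim; the wording for the new tuple return is assumed:
```python
def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=False):
    """
    Generate lists of file paths in the input directory based on ignore pattern rules.

    Parameters:
    ------------
    input_directory - String. Directory to traverse for files.
    ignore_file - String [optional]. Filepath for the ignore patterns file.
    include_hidden - Boolean [optional]. Whether to include hidden files.

    Returns:
    ---------
    (regular_files, zip_archives) - Tuple of lists. Non-ignored files in
    input_directory, split into regular files and ZIP archives.
    """
```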

"""

if not os.path.isdir(input_directory):
raise NotADirectoryError(input_directory)

self.reset_filter(ignore_file=ignore_file, include_hidden=include_hidden)

file_paths = []
regular_files = []
zip_archives = []
root_directory = os.path.abspath(input_directory)
has_files = False

@@ -55,11 +50,14 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa
for name in files:
file_path = os.path.join(root, name)
if self.filter_manager.should_include(file_path, root_directory):
file_paths.append(file_path)
if zipfile.is_zipfile(file_path):
zip_archives.append(file_path)
else:
regular_files.append(file_path)

if not has_files:
raise EmptyInputDirectoryError(input_directory)
if not file_paths:
if not (regular_files or zip_archives):
raise NoFilesAfterFilteringError(input_directory, ignore_file)

return file_paths
return regular_files, zip_archives