-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/zip support #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e54368c
4e5dcd9
63c64f7
dfe3df3
e6518bd
c3c2cf3
3f171f0
edd8e65
7d9df08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,65 @@ | ||||||||||||||||||||||||||
import os | ||||||||||||||||||||||||||
import zipfile | ||||||||||||||||||||||||||
import tempfile | ||||||||||||||||||||||||||
import shutil | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
class ArchiveHandler:
    """Extract ZIP archives into temporary directories and track them for cleanup."""

    def __init__(self):
        # Most recent extraction directory (kept for backward compatibility
        # with callers that read this attribute directly).
        self.temp_dir = None
        # Every directory created by process_zip. cleanup() removes them all,
        # so repeated process_zip calls no longer leak earlier extractions.
        self._temp_dirs = []

    def process_zip(self, zip_path, root_dir):
        """
        Process a zip file and return paths to its contents.

        Parameters:
        ------------
        zip_path - String. Path to the zip file.
        root_dir - String. Root directory for relative path calculations.
                   (Currently unused; kept for interface compatibility.)

        Returns:
        ---------
        List of tuples (file_path, relative_path) for files in the zip.
        Returns an empty list if zip_path is not a valid zip file.
        """
        if not zipfile.is_zipfile(zip_path):
            return []

        # One temporary directory per call, so a second call does not orphan
        # the directory created by the first.
        temp_dir = tempfile.mkdtemp()
        self.temp_dir = temp_dir
        self._temp_dirs.append(temp_dir)

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Extract all contents to the per-call temp directory.
                zip_ref.extractall(temp_dir)

                # Collect the regular files contained in the zip.
                file_paths = []
                for member in zip_ref.namelist():
                    # Directory entries end with '/'; only files are returned.
                    if member.endswith('/'):
                        continue
                    full_path = os.path.join(temp_dir, member)
                    # The path as it should appear in the CSV: zip_path/member
                    rel_path = f"{zip_path}/{member}"
                    file_paths.append((full_path, rel_path))
                return file_paths
        except Exception:
            self.cleanup()
            raise  # bare raise preserves the original traceback

    def cleanup(self):
        """Remove every temporary directory created by process_zip."""
        for directory in self._temp_dirs:
            if os.path.exists(directory):
                shutil.rmtree(directory)
        self._temp_dirs = []
        self.temp_dir = None

    @staticmethod
    def stream_zip(zip_path):
        """
        Yield (name, file-like object) for each file in the ZIP archive.
        Only yields regular files (not directories).

        NOTE: the yielded file objects are only valid while iteration is in
        progress, because the underlying ZipFile is closed when the generator
        is exhausted.
        """
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.namelist():
                if member.endswith('/'):
                    continue  # skip directories
                yield member, zip_ref.open(member)
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -5,13 +5,13 @@ class Hasher: | |||||||||
def __init__(self, algorithm='md5'): | ||||||||||
self.algorithm = algorithm | ||||||||||
|
||||||||||
def checksum_file(self, file_path, algorithm=None, length=None): | ||||||||||
def checksum_file(self, file_path_or_obj, algorithm=None, length=None): | ||||||||||
""" | ||||||||||
Calculate the checksum of a file using the specified algorithm. | ||||||||||
|
||||||||||
Parameters: | ||||||||||
------------ | ||||||||||
file_path - String. Path to file to apply checksum function. | ||||||||||
file_path_or_obj - String or file-like object. Path to file or file-like object to apply checksum function. | ||||||||||
algorithm - String. Hash function to use for checksums. Default: 'md5', see options with 'hashlib.algorithms_available'. | ||||||||||
length - Integer [optional]. Length of the digest for SHAKE and BLAKE algorithms in bytes. | ||||||||||
|
||||||||||
|
@@ -55,9 +55,14 @@ def checksum_file(self, file_path, algorithm=None, length=None): | |||||||||
raise LengthUsedForFixedLengthHashError(algorithm) | ||||||||||
hash_func = hashlib.new(algorithm) | ||||||||||
|
||||||||||
# Read the file and update the hash function | ||||||||||
with open(file_path, "rb") as f: | ||||||||||
for chunk in iter(lambda: f.read(4096), b""): | ||||||||||
# Handle both file paths and file-like objects | ||||||||||
if isinstance(file_path_or_obj, str): | ||||||||||
with open(file_path_or_obj, "rb") as f: | ||||||||||
for chunk in iter(lambda: f.read(4096), b""): | ||||||||||
hash_func.update(chunk) | ||||||||||
else: | ||||||||||
# Assume it's a file-like object | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The file-like object handling should include error handling for cases where the object doesn't support the read() method. Consider adding a try-except block or type checking to provide a clearer error message.
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm having a hard time concocting a scenario where |
||||||||||
for chunk in iter(lambda: file_path_or_obj.read(4096), b""): | ||||||||||
hash_func.update(chunk) | ||||||||||
|
||||||||||
# Return the hash digest | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
import os | ||
import zipfile | ||
from sumbuddy.filter import Filter | ||
from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError, NotADirectoryError | ||
from sumbuddy.archive import ArchiveHandler | ||
|
||
class Mapper: | ||
def __init__(self): | ||
self.filter_manager = Filter() | ||
self.archive_handler = ArchiveHandler() | ||
|
||
def reset_filter(self, ignore_file=None, include_hidden=False): | ||
""" | ||
|
@@ -28,24 +31,16 @@ def reset_filter(self, ignore_file=None, include_hidden=False): | |
def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=False): | ||
""" | ||
Generate list of file paths in the input directory based on ignore pattern rules. | ||
|
||
Parameters: | ||
------------ | ||
input_directory - String. Directory to traverse for files. | ||
ignore_file - String [optional]. Filepath for the ignore patterns file. | ||
include_hidden - Boolean [optional]. Whether to include hidden files. | ||
|
||
Returns: | ||
--------- | ||
file_paths - List. Files in input_directory that are not ignored. | ||
Returns a tuple: (regular_files, zip_archives) | ||
Comment on lines
33
to
+34
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The docstring should be updated to include the complete parameter and return documentation that was removed. The current docstring is incomplete and missing parameter descriptions and detailed return value information. Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, let's keep the inputs back in the format they were in previously |
||
""" | ||
|
||
if not os.path.isdir(input_directory): | ||
raise NotADirectoryError(input_directory) | ||
|
||
self.reset_filter(ignore_file=ignore_file, include_hidden=include_hidden) | ||
|
||
file_paths = [] | ||
regular_files = [] | ||
zip_archives = [] | ||
root_directory = os.path.abspath(input_directory) | ||
has_files = False | ||
|
||
|
@@ -55,11 +50,14 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa | |
for name in files: | ||
file_path = os.path.join(root, name) | ||
if self.filter_manager.should_include(file_path, root_directory): | ||
file_paths.append(file_path) | ||
if zipfile.is_zipfile(file_path): | ||
zip_archives.append(file_path) | ||
else: | ||
regular_files.append(file_path) | ||
|
||
if not has_files: | ||
raise EmptyInputDirectoryError(input_directory) | ||
if not file_paths: | ||
if not (regular_files or zip_archives): | ||
raise NoFilesAfterFilteringError(input_directory, ignore_file) | ||
|
||
return file_paths | ||
return regular_files, zip_archives |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The total_files calculation iterates through all ZIP archives twice - once to count files for the progress bar and again during processing. This is inefficient for large ZIP files. Consider caching the file counts or restructuring the progress tracking.
Copilot uses AI. Check for mistakes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Non-blocking for now. We can open a new issue to detect archives via file content headers, and to use more generic "Archive" language in `__main__.py` rather than referring to ZIP specifically, once an additional archive file type is supported.