Skip to content

Commit 6d03579

Browse files
authored
Merge pull request #16 from MK2112/11-batch-pdf-splitting-merging
Integrate PDF splitting and merging
2 parents 6cb0072 + 3f62eff commit 6d03579

12 files changed

+364
-38
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ You can structure a command in three fundamental ways:
5959
| `-q` or </br>`--quality` | Set output file quality, either `low`, `medium`, or `high`; default is same as input. |
6060
| `-m` or </br>`--merge` | Merge movie file with equally named audio file to become its audio track. |
6161
| `-c` or </br>`--concat` | Concatenate input files of the same type (images, audio, video) into one output file (e.g. `concatenated_video.mp4` for movie files, `concatenated_audio.mp3` for audio files). |
62+
| `-s` or </br>`--split` | Split a PDF into multiple files, either by page count or page ranges, e.g. `1-2,3-5` or `10` or `1-3,2-6,8-end` or `1-5,rest`. |
6263
| `-a` or </br>`--across` | Merge/Concatenate across directories when multiple directories are provided. |
6364
| `-w` or </br>`--web` | Ignores all other arguments, starts browser + a web server at `http://localhost:5000`. |
6465
| `-d` or </br>`--delete` | Delete input files after conversion. |
@@ -103,6 +104,11 @@ You may also convert the images contained in a DOCX file to an MP4 video:
103104
python any_to_any.py -i /path/to/file.docx -f mp4
104105
```
105106

107+
Split a PDF into multiple files, either by page count or page ranges, e.g. `1-2,3-5` or `10` or `1-3,2-6,8-end` or `1-5,rest`:
108+
```python
109+
python any_to_any.py -i /path/to/file.pdf -s 1-3,2-6,8-end
110+
```
111+
106112
### Directory Processing
107113
Directory Processing is useful when you want to work with multiple files in a directory
108114

any_to_any.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,26 @@
122122
type=str,
123123
required=False,
124124
)
125+
parser.add_argument(
126+
"-s",
127+
"--split",
128+
help=f"{lang.get_translation('split_help', controller.locale)}",
129+
type=str,
130+
required=False,
131+
)
125132

126133
args = vars(parser.parse_args())
127134

135+
if args["split"] and (args["merge"] or args["concat"]):
136+
parser.error(
137+
f"{lang.get_translation('split_merge_error', controller.locale)}"
138+
)
139+
140+
if args["merge"] and args["concat"]:
141+
parser.error(
142+
f"{lang.get_translation('merge_concat_error', controller.locale)}"
143+
)
144+
128145
if args["web"]:
129146
# Check for web frontend request
130147
if os.name in ["nt"]:
@@ -139,6 +156,7 @@
139156
output=args["output"],
140157
framerate=args["framerate"],
141158
quality=args["quality"],
159+
split=args["split"],
142160
merge=args["merge"],
143161
concat=args["concat"],
144162
delete=args["delete"],

core/controller.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
clips_array,
2424
)
2525

26-
# TODO: Add converter-wise tests
27-
2826
class Controller:
2927
"""
3028
Taking an input directory of files, convert them to a multitude of formats.
@@ -227,6 +225,7 @@ def run(
227225
output: str,
228226
framerate: int,
229227
quality: str,
228+
split: str,
230229
merge: bool,
231230
concat: bool,
232231
delete: bool,
@@ -300,6 +299,8 @@ def run(
300299
self.merging = merge
301300
# Concatenate files of same type (img/movie/audio) back to back
302301
self.concatenating = concat
302+
# Split files into smaller parts
303+
self.page_ranges = split
303304

304305
file_paths = {}
305306
was_none, found_files = False, False
@@ -347,7 +348,7 @@ def run(
347348
f"[!] {lang.get_translation('error', self.locale)}: {lang.get_translation('dropzone_diff', self.locale)}",
348349
)
349350
self.event_logger.info(
350-
f"[+] {lang.get_translation('dropzone_active', self.locale)} {self.input}"
351+
f"[>] {lang.get_translation('dropzone_active', self.locale)} {self.input}"
351352
)
352353
self.watchdropzone(self.input)
353354
return
@@ -475,6 +476,8 @@ def process_file_paths(self, file_paths: dict) -> None:
475476
self.merge(file_paths, getattr(self, "across", False))
476477
elif self.concatenating:
477478
self.concat(file_paths, self.target_format)
479+
elif self.page_ranges is not None:
480+
self.split(file_paths, self.page_ranges)
478481
else:
479482
# Handle unsupported formats here
480483
end_with_msg(
@@ -559,6 +562,25 @@ def handle_file_event(event_type: str, file_path: str) -> None:
559562
)
560563
raise
561564

565+
def split(self, file_paths: dict, page_ranges) -> None:
    """
    Split every PDF among the document inputs into the requested page ranges.

    Args:
        file_paths: Mapping of input category to path sets; only the
            ``Category.DOCUMENT`` entry is read here. Each path set is
            indexed as ``[1]`` for the file name and ``[2]`` for the format.
        page_ranges: Page range specification, forwarded unchanged to
            ``self.doc_converter.split_pdf``.
    """
    for doc_path_set in file_paths[Category.DOCUMENT]:
        # Publish a status line for this job when a shared progress mapping
        # is attached to the progress logger and already tracks the job.
        # NOTE(review): the previous code wrapped this update in
        # `with threading.Lock():`. A lock created on the spot is never shared
        # with another thread, so it provided no mutual exclusion. If this
        # update must be synchronised, a lock owned alongside the shared
        # mapping has to be used instead.
        progress = getattr(self.prog_logger, "shared_progress_dict", None)
        if progress and self.prog_logger.job_id in progress:
            progress[self.prog_logger.job_id].update(
                {
                    "status": f"splitting {doc_path_set[1]}",
                    "last_updated": time.time(),
                }
            )

        # Only PDFs can be split; other document formats are left untouched.
        if doc_path_set[2] == "pdf":
            self.doc_converter.split_pdf(
                output=self.output,
                doc_path_set=doc_path_set,
                format="pdf",
                delete=self.delete,
                page_ranges=page_ranges,
            )
583+
562584
def concat(self, file_paths: dict, format: str) -> None:
563585
# Concatenate files of same type (img/movie/audio) back to back
564586
# Concatenate audio files
@@ -634,7 +656,7 @@ def concat(self, file_paths: dict, format: str) -> None:
634656
if file_paths[Category.DOCUMENT] and (
635657
format is None or format in self._supported_formats[Category.DOCUMENT]
636658
):
637-
pdf_out_path = os.path.join(self.output, "concatenated_pdfs.pdf")
659+
pdf_out_path = os.path.join(self.output, "concatenated.pdf")
638660
pdfs = sorted(
639661
[
640662
doc_path_set if doc_path_set[2] == "pdf" else None
@@ -737,7 +759,6 @@ def concat(self, file_paths: dict, format: str) -> None:
737759
f"[+] {lang.get_translation('concat_success', self.locale)}"
738760
)
739761

740-
741762
def merge(self, file_paths: dict, across: bool = False) -> None:
742763
# For movie files and equally named audio file, merge them together under same name
743764
# (movie with audio with '_merged' addition to name)

core/doc_converter.py

Lines changed: 132 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def to_subtitles(
179179
input_path = self.file_handler.join_back(movie_path_set)
180180
out_path = os.path.abspath(os.path.join(output, f"{movie_path_set[1]}.srt"))
181181
self.event_logger.info(
182-
f"[+] {lang.get_translation('extract_subtitles', self.locale)} '{input_path}'"
182+
f"[>] {lang.get_translation('extract_subtitles', self.locale)} '{input_path}'"
183183
)
184184
try:
185185
# Use FFmpeg to extract subtitles
@@ -200,7 +200,7 @@ def to_subtitles(
200200

201201
if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
202202
self.event_logger.info(
203-
f"[+] {lang.get_translation('subtitles_success', self.locale)} '{out_path}'"
203+
f"[>] {lang.get_translation('subtitles_success', self.locale)} '{out_path}'"
204204
)
205205
self.file_handler.post_process(
206206
movie_path_set, out_path, delete, show_status=False
@@ -217,7 +217,7 @@ def to_subtitles(
217217
)
218218
if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
219219
self.event_logger.info(
220-
f"[+] {lang.get_translation('embed_subtitles_success')} '{out_path}'"
220+
f"[>] {lang.get_translation('embed_subtitles_success')} '{out_path}'"
221221
)
222222
self.file_handler.post_process(
223223
movie_path_set, out_path, delete, show_status=False
@@ -400,3 +400,132 @@ def _place_img(page, img_path, full_page=False):
400400
doc.add_page_break()
401401
doc.save(out_path)
402402
self.file_handler.post_process(document_path_set, out_path, delete)
403+
404+
def split_pdf(self, output: str, doc_path_set: tuple, page_ranges: str, delete: bool, format: str = "pdf"):
    """
    Split one PDF into several PDFs, one per requested page range.

    Args:
        output: Output directory; created if it does not exist.
        doc_path_set: ``(path, name, format)`` tuple describing the input.
        page_ranges: Page range specification, parsed by
            ``self._parse_page_ranges`` (1-based, inclusive ranges).
        delete: Forwarded to ``self.file_handler.post_process``.
        format: Format of the document; anything other than ``"pdf"`` is a
            no-op.

    Nothing is written when the document has no pages or when the range
    specification yields no usable ranges.
    """
    doc_path = self.file_handler.join_back(doc_path_set)
    if format != "pdf":
        return

    out_path = None
    pdf = fitz.open(doc_path)
    try:
        total_pages = len(pdf)
        if total_pages == 0:
            return

        # None means any splitting efforts would be futile, skip
        parsed_ranges = self._parse_page_ranges(page_ranges, total_pages)
        if parsed_ranges is None:
            return

        os.makedirs(output, exist_ok=True)
        single_range = len(parsed_ranges) == 1
        for index, (start, end) in enumerate(parsed_ranges, start=1):
            if single_range:
                out_filename = f"{doc_path_set[1]}_{start}-{end}.{format}"
            else:
                out_filename = f"{doc_path_set[1]}_split_{index}_{start}-{end}.{format}"
            out_path = os.path.abspath(os.path.join(output, out_filename))

            new_pdf = fitz.open()
            try:
                # Parsed ranges are 1-based; insert_pdf takes 0-based page indices.
                new_pdf.insert_pdf(pdf, from_page=start - 1, to_page=end - 1)
                new_pdf.save(out_path)
            finally:
                # Release the part document even if copying or saving failed.
                new_pdf.close()
            self.event_logger.info(f"[>] {lang.get_translation('split_produced', self.locale)}: {out_path}")
    finally:
        # Always release the source document, also on early return or error,
        # and before post-processing touches the input file.
        pdf.close()

    if out_path is not None:
        # As before, post-processing receives the last part that was written.
        self.file_handler.post_process(doc_path_set, out_path, delete)
444+
445+
def _parse_page_ranges(self, page_ranges: str, total_pages: int) -> list[tuple[int, int]] | None:
446+
"""
447+
Parse page_ranges string into list of (start, end) tuples.
448+
Handles: None, '1-5', '2-5,3-4', '3-6, 8-20, 23-45, rest', 'all', 'rest',
449+
'3-6;15-22', '1-7;8-15;20-22;rest', '3', '35', '12-end', '2-end',
450+
'2-5, 3-6, 12-end', '2-end, 3-6, 12-end', '2-end, 3-6'
451+
"""
452+
if not page_ranges or page_ranges.strip() == "":
453+
# Default to all pages
454+
return None #[(1, total_pages)], we don't need that, skip that
455+
page_ranges = page_ranges.strip()
456+
if page_ranges.lower() in ["all", "rest"]:
457+
return None #[(1, total_pages)], we don't need that, skip that
458+
459+
ranges = []
460+
for delim in [',', ';']:
461+
# Split by comma or semicolon
462+
if delim in page_ranges:
463+
ranges = [r.strip() for r in page_ranges.split(delim)]
464+
break
465+
if not ranges:
466+
# No delimiters found
467+
ranges = [page_ranges.strip()]
468+
469+
parsed_ranges = []
470+
# Track where 'rest' should start
471+
rest_start = 1
472+
473+
for range_str in ranges:
474+
range_str = range_str.strip()
475+
if range_str.lower() == "rest":
476+
# 'rest': from highest processed page + 1 to end
477+
if parsed_ranges:
478+
# Find highest end page processed so far
479+
max_end = max(end for _, end in parsed_ranges)
480+
rest_start = max_end + 1
481+
parsed_ranges.append((rest_start, total_pages))
482+
continue
483+
484+
# Handle single page numbers
485+
if range_str.isdigit():
486+
page_num = int(range_str)
487+
if 1 <= page_num <= total_pages:
488+
parsed_ranges.append((page_num, page_num))
489+
rest_start = max(rest_start, page_num + 1)
490+
continue
491+
492+
# Handle ranges like "1-5", "12-end", "2-end"
493+
if '-' in range_str:
494+
parts = range_str.split('-', 1)
495+
if len(parts) == 2:
496+
start_str, end_str = parts[0].strip(), parts[1].strip()
497+
498+
# Parse start
499+
try:
500+
start = int(start_str)
501+
except ValueError:
502+
continue # Some invalid range
503+
504+
# Parse end
505+
if end_str.lower() == "end":
506+
end = total_pages
507+
else:
508+
try:
509+
end = int(end_str)
510+
except ValueError:
511+
continue # Some invalid range
512+
513+
# Validate, add range
514+
if 1 <= start <= total_pages and 1 <= end <= total_pages and start <= end:
515+
parsed_ranges.append((start, end))
516+
rest_start = max(rest_start, end + 1)
517+
518+
# Remove duplicate range tuples, preserve order
519+
# Ranges still are allowed to overlap, must each be unique though
520+
seen = set()
521+
unique_ranges = []
522+
for range_tuple in parsed_ranges:
523+
if range_tuple not in seen:
524+
seen.add(range_tuple)
525+
unique_ranges.append(range_tuple)
526+
527+
if not unique_ranges:
528+
# If no valid ranges parsed, default to all pages, which is None
529+
unique_ranges = None
530+
531+
return unique_ranges

core/movie_converter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ def to_protocol(
349349
exist_ok=True,
350350
)
351351
self.event_logger.info(
352-
f"[+] {lang.get_translation('get_hls', self.locale)} {self.file_handler.join_back(movie_path_set)}: {resolution} at {v_bitrate} video, {a_bitrate} audio"
352+
f"[>] {lang.get_translation('get_hls', self.locale)} {self.file_handler.join_back(movie_path_set)}: {resolution} at {v_bitrate} video, {a_bitrate} audio"
353353
)
354354
stream = [
355355
"-map",
@@ -383,7 +383,7 @@ def to_protocol(
383383
cmd += stream
384384
variant_playlist += f"#EXT-X-STREAM-INF:BANDWIDTH={int(v_bitrate[:-1]) * 1000},RESOLUTION={resolution}\n{i}.m3u8\n"
385385
self.event_logger.info(
386-
f"[+] {lang.get_translation('get_hls_master', self.locale)} {self.file_handler.join_back(movie_path_set)}"
386+
f"[>] {lang.get_translation('get_hls_master', self.locale)} {self.file_handler.join_back(movie_path_set)}"
387387
)
388388
master_playlist_path = os.path.join(current_out_dir, "master.m3u8")
389389

@@ -402,7 +402,7 @@ def to_protocol(
402402
)
403403
elif protocol[0] == "dash":
404404
self.event_logger.info(
405-
f"[+] {lang.get_translation('create_dash', self.locale)} {self.file_handler.join_back(movie_path_set)}"
405+
f"[>] {lang.get_translation('create_dash', self.locale)} {self.file_handler.join_back(movie_path_set)}"
406406
)
407407
out_path = os.path.join(current_out_dir, "manifest.mpd")
408408
cmd = [

core/utils/file_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def post_process(
2828
# Only log if the conversion was successful and output exists
2929
if show_status and os.path.exists(out_path):
3030
self.event_logger.info(
31-
f"[+] {lang.get_translation('converted', self.locale)} "
31+
f"[>] {lang.get_translation('converted', self.locale)} "
3232
f'"{source_path}" 🡢 "{out_path}"'
3333
)
3434

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def test_blank_start_no_files_in_cli_output(controller_instance, caplog):
3838
None,
3939
None,
4040
None,
41+
None,
4142
False,
4243
False,
4344
False,

0 commit comments

Comments
 (0)