|
8 | 8 | from typing import List |
9 | 9 | from uuid import uuid4 |
10 | 10 |
|
11 | | -from redact import redact_image, redact_fott_label, redact_ocr_result |
| 11 | +from redact import redact_fott_label, redact_ocr_result, redact_file_bundle |
12 | 12 | from redact.io.blob_reader import BlobReader |
13 | 13 | from redact.io.blob_writer import BlobWriter |
14 | 14 | from redact.io.local_reader import LocalReader |
15 | 15 | from redact.io.local_writer import LocalWriter |
16 | 16 | from redact.utils.file_name import get_redacted_file_name, valid_url |
17 | | -from redact.utils.pdf_renderer import PdfRenderer |
18 | 17 | from redact.types.file_bundle import FileType, FileBundle |
19 | | -from redact.types.pre_processing_bundle import PdfPreProcessingBundle |
| 18 | +from redact.preprocess import preprocess_multi_page_bundle |
20 | 19 |
|
21 | 20 |
|
22 | 21 | # Strong assumption: every valid URL is treated as an Azure Blob URL. |
23 | 22 | def is_blob_url(url: str) -> bool: |
24 | 23 | return valid_url(url) |
25 | 24 |
|
26 | 25 |
|
27 | | -def process_pdf_bundle(file_bundles: List[FileBundle], fields_to_redact: List[str]): |
28 | | - renderer = PdfRenderer() |
29 | | - |
30 | | - for file_bundle in file_bundles: |
31 | | - pdf_pre_processing_bundle = PdfPreProcessingBundle.from_file_bundle(file_bundle) |
32 | | - |
33 | | - redacted_image_name = get_redacted_file_name(pdf_pre_processing_bundle.rendered_file_name) |
34 | | - redacted_fott_name = get_redacted_file_name(file_bundle.fott_file_name) |
35 | | - redacted_ocr_name = get_redacted_file_name(file_bundle.ocr_file_name) |
36 | | - |
37 | | - # Render PDF |
38 | | - renderer.render_pdf_and_save( |
39 | | - Path(build_pre_processing_folder, file_bundle.image_file_name), |
40 | | - Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name), |
41 | | - target_pdf_render_dpi) |
42 | | - |
43 | | - # Follow the regular redaction process with taking files from slightly different source folders |
44 | | - redact_image( |
45 | | - Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name), |
46 | | - Path(build_pre_processing_folder, file_bundle.fott_file_name), |
47 | | - Path(build_output_folder, redacted_image_name), |
48 | | - fields_to_redact) |
49 | | - redact_fott_label( |
50 | | - Path(build_pre_processing_folder, file_bundle.fott_file_name), |
51 | | - Path(build_output_folder, redacted_fott_name), |
52 | | - fields_to_redact) |
53 | | - redact_ocr_result( |
54 | | - Path(build_pre_processing_folder, file_bundle.ocr_file_name), |
55 | | - Path(build_pre_processing_folder, file_bundle.fott_file_name), |
56 | | - Path(build_output_folder, redacted_ocr_name), |
57 | | - fields_to_redact) |
58 | | - |
59 | | -if __name__ == '__main__': |
| 26 | +if __name__ == "__main__": |
60 | 27 | input_container = sys.argv[1] |
61 | 28 | input_path = sys.argv[2] |
62 | 29 | output_container = sys.argv[3] |
63 | 30 | output_path = sys.argv[4] |
| 31 | + api_version = sys.argv[5] |
64 | 32 | target_pdf_render_dpi = 300 |
65 | | - fields_to_redact = [] |
| 33 | + fields_to_redact = tuple() |
66 | 34 |
|
67 | | - if len(sys.argv) >= 6: |
68 | | - fields_to_redact = (sys.argv[5].split(',')) |
| 35 | + if len(sys.argv) >= 7: |
| 36 | + fields_to_redact = sys.argv[6].split(",") |
69 | 37 |
|
70 | 38 | # Randomly generated UUID in the build folder name to prevent collisions. |
71 | | - build_path = Path(f'build-{uuid4()}/') |
72 | | - build_pre_processing_folder = Path(build_path, "pre/") |
| 39 | + build_path = Path(f"build-{uuid4()}/") |
| 40 | + build_pre_folder = Path(build_path, "pre/") |
73 | 41 | build_input_folder = Path(build_path, "in/") |
74 | 42 | build_output_folder = Path(build_path, "out/") |
75 | | - Path(build_pre_processing_folder).mkdir(parents=True, exist_ok=True) |
| 43 | + Path(build_pre_folder).mkdir(parents=True, exist_ok=True) |
76 | 44 | Path(build_input_folder).mkdir(parents=True, exist_ok=True) |
77 | 45 | Path(build_output_folder).mkdir(parents=True, exist_ok=True) |
| 46 | + |
78 | 47 | try: |
79 | 48 | file_bundle_list = None |
80 | | - pdf_file_bundle_list = None |
| 49 | + multi_page_bundle_list = None |
81 | 50 | if is_blob_url(input_container): |
82 | 51 | reader = BlobReader(input_container, input_path) |
83 | | - pdf_file_bundle_list = reader.download_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY) |
| 52 | + multi_page_bundle_list = reader.download_bundles( |
| 53 | + to=build_pre_folder, mode=FileType.MULTI_PAGE |
| 54 | + ) |
84 | 55 | file_bundle_list = reader.download_bundles(to=build_input_folder) |
85 | 56 | else: |
86 | 57 | reader = LocalReader(input_path) |
87 | | - pdf_file_bundle_list = reader.copy_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY) |
| 58 | + multi_page_bundle_list = reader.copy_bundles( |
| 59 | + to=build_pre_folder, mode=FileType.MULTI_PAGE |
| 60 | + ) |
88 | 61 | file_bundle_list = reader.copy_bundles(to=build_input_folder) |
89 | 62 |
|
| 63 | + per_page_bundle_list: List[FileBundle] = [] |
| 64 | + |
| 65 | + # Render and process PDF/TIFF files if any. |
| 66 | + if multi_page_bundle_list is not None: |
| 67 | + for fb in multi_page_bundle_list: |
| 68 | + bundle_list = preprocess_multi_page_bundle( |
| 69 | + fb, build_pre_folder, build_input_folder, target_pdf_render_dpi |
| 70 | + ) |
| 71 | + per_page_bundle_list.extend(bundle_list) |
| 72 | + |
| 73 | + # Short path: preprocess folder -> output folder. |
| 74 | + # We still need to redact the full label file. |
| 75 | + redact_fott_label( |
| 76 | + Path(build_pre_folder, fb.fott_file_name), |
| 77 | + Path( |
| 78 | + build_output_folder, get_redacted_file_name(fb.fott_file_name) |
| 79 | + ), |
| 80 | + fields_to_redact, |
| 81 | + ) |
| 82 | + |
| 83 | + # We still need to redact the full ocr file. |
| 84 | + redact_ocr_result( |
| 85 | + Path(build_pre_folder, fb.ocr_file_name), |
| 86 | + Path(build_pre_folder, fb.fott_file_name), |
| 87 | + Path(build_output_folder, get_redacted_file_name(fb.ocr_file_name)), |
| 88 | + api_version, |
| 89 | + fields_to_redact, |
| 90 | + ) |
| 91 | + |
| 92 | + # Process images and per-page results from multi-page documents. |
| 93 | + file_bundle_list.extend(per_page_bundle_list) |
90 | 94 | for fb in file_bundle_list: |
91 | | - redacted_image_name = get_redacted_file_name(fb.image_file_name) |
92 | | - redacted_fott_name = get_redacted_file_name(fb.fott_file_name) |
93 | | - redacted_ocr_name = get_redacted_file_name(fb.ocr_file_name) |
94 | | - |
95 | | - redact_image( |
96 | | - Path(build_input_folder, fb.image_file_name), |
97 | | - Path(build_input_folder, fb.fott_file_name), |
98 | | - Path(build_output_folder, redacted_image_name), |
99 | | - fields_to_redact) |
100 | | - redact_fott_label( |
101 | | - Path(build_input_folder, fb.fott_file_name), |
102 | | - Path(build_output_folder, redacted_fott_name), |
103 | | - fields_to_redact) |
104 | | - redact_ocr_result( |
105 | | - Path(build_input_folder, fb.ocr_file_name), |
106 | | - Path(build_input_folder, fb.fott_file_name), |
107 | | - Path(build_output_folder, redacted_ocr_name), |
108 | | - fields_to_redact) |
109 | | - |
110 | | - # Render and process PDF files if any |
111 | | - if pdf_file_bundle_list is not None: |
112 | | - process_pdf_bundle(pdf_file_bundle_list, fields_to_redact) |
| 95 | + redact_file_bundle( |
| 96 | + fb, |
| 97 | + build_input_folder, |
| 98 | + build_output_folder, |
| 99 | + api_version, |
| 100 | + fields_to_redact, |
| 101 | + ) |
113 | 102 |
|
114 | 103 | if is_blob_url(output_container): |
115 | 104 | writer = BlobWriter(output_container, output_path) |
|
0 commit comments