microsoft
diff --git a/‎scripts/redact_cli_py/.flake8‎
Lines changed: 3 additions & 0 deletions b/‎scripts/redact_cli_py/.flake8‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎scripts/redact_cli_py/CHANGELOG.md‎
Lines changed: 34 additions & 0 deletions b/‎scripts/redact_cli_py/CHANGELOG.md‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎scripts/redact_cli_py/Pipfile‎
Lines changed: 6 additions & 0 deletions b/‎scripts/redact_cli_py/Pipfile‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎scripts/redact_cli_py/Pipfile.lock‎
Lines changed: 419 additions & 205 deletions b/‎scripts/redact_cli_py/Pipfile.lock‎
Lines changed: 419 additions & 205 deletions
diff --git a/‎scripts/redact_cli_py/README.md‎
Lines changed: 38 additions & 9 deletions b/‎scripts/redact_cli_py/README.md‎
Lines changed: 38 additions & 9 deletions
diff --git a/‎scripts/redact_cli_py/batch_redact.py‎
Lines changed: 56 additions & 67 deletions b/‎scripts/redact_cli_py/batch_redact.py‎
Lines changed: 56 additions & 67 deletions
diff --git a/‎scripts/redact_cli_py/redact.py‎
Lines changed: 17 additions & 12 deletions b/‎scripts/redact_cli_py/redact.py‎
Lines changed: 17 additions & 12 deletions
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E501, PIE798
@@ -6,6 +6,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.2] - 2022-08-11
+### Changed
+- Refactor code styles with flake8/black and their extensions.
+
+## [0.3.1] - 2022-08-02
+### Added
+- Support for multi-page PDFs and TIFFs in the batch redact CLI (`batch_redact.py`).
+
+## [0.3.0] - 2022-01-06
+### Added
+- Support for the Form Recognizer OCR result v3.0 format while maintaining backward compatibility with v2.0 and v2.1.
+
+### Changed
+- The default API version of OCR result redaction has changed from v2.x to v3.x schema.
+- You now need to specify which version of the OCR result you want to redact in `redact.py` and `batch_redact.py`.
+  - Before:
+
+  ``` bash
+  python redact.py ocr <ocr_result_path> <fott_label_path> <output_path>
+  python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path>
+  ```
+
+  - After:
+
+  ``` bash
+  python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version>
+  python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path> <api_version>
+  ```
+
+  Where API Version is one of the following:
+  - v2.0
+  - v2.1
+  - v3.0
+
 ## [0.2.3] - 2021-12-13
 ### Added
 - Support to redact some Latin ligature letters and letters with diacritics.
 
@@ -10,6 +10,12 @@ shapely = "*"
 dacite = "*"
 azure-storage-blob = "*"
 pypdfium = "*"
+flake8 = "*"
+black = "*"
+flake8-bugbear = "*"
+flake8-pie = "*"
+pep8-naming = "*"
+flake8-black = "*"
 
 [dev-packages]
 pytest = "*"
 
@@ -10,11 +10,11 @@ The OCR.json and labels.json will also be redacted while keeping the semantics o
 ![ocr-before-after-redaction](./images/ocr-before-after-redaction.png)
 ![labels-before-after-redaction](./images/labels-before-after-redaction.png)
 
-## Language support
+## Language Support
 This tool supports Latin characters redaction only. For any non-Latin document support, please [contact us](mailto:formrecog_contact@microsoft.com?subject=Redaction%20tool%20language%20support).
 
 ## Version
-Redact CLI 0.2.3
+Redact CLI 0.3.2
 
 ## Setup Environment
 
@@ -103,7 +103,21 @@ python redact.py image <image_path> <fott_label_path> <output_path>
 ### Redact OCR Result
 
 ``` bash
-python redact.py ocr <ocr_result_path> <fott_label_path> <output_path>
+python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version>
+```
+
+#### API Version
+
+In Azure Form Recognizer, the OCR result has a different schema for each API version. To successfully redact the OCR result, you must pass one of the following `<api_version>` values to the redaction toolkit.
+
+- v2.0
+- v2.1
+- v3.0
+
+For example,
+
+``` bash
+python redact.py ocr sample.ocr.json sample.labels.json redacted_sample.ocr.json "v3.0"
 ```
 
 ### Redact FOTT Label Path
@@ -113,6 +127,7 @@ python redact.py fott <fott_label_path> <output_path>
 ```
 
 ### Redact specific labels from Image, OCR results or FOTT Label Path
+
 In some specific use-cases, the need may arise to redact specific labels from an image, OCR results or/and FOTT Label Path.
 Labels to be redacted need to be provided together in a string separated by commas.
 
@@ -127,17 +142,17 @@ And _Label_01_ and _Label_04_ need to be redacted, the following commands can be
 #### Redact specific labels from Image
 
 ``` bash
-python redact.py image <fott_label_path> <output_path> "Label_01,Label_04"
+python redact.py image <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
 ```
 #### Redact specific labels from OCR Result
 
 ``` bash
-python redact.py ocr <ocr_result_path> <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
+python redact.py ocr <ocr_result_path> <fott_label_path> <output_path> <api_version> "Label_01,Label_04"
 ```
 #### Redact specific labels from FOTT Label Path
 
 ``` bash
-python redact.py image <image_path> <fott_label_path> <output_path> "Label_01,Label_04"
+python redact.py fott <fott_label_path> <output_path> "Label_01,Label_04"
 ```
 
 ### Batch Redaction
@@ -146,7 +161,7 @@ Batch redaction supports redacting a folder rather than executing on a single fi
 2. Azure Blob Storage virtual folder: a URL to a Blob Storage container and a folder path that denotes the folder.
 
 ``` bash
-python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path>
+python batch_redact.py <input_container> <input_folder_path> <output_container> <output_folder_path> <api_version>
 ```
 
 #### Container
@@ -176,12 +191,16 @@ python batch_redact.py local raw/ "https://my.blob.account/data?<my_secret_SAS_t
 python batch_redact.py "https://my.blob.account/data?<my_secret_SAS_token>" folder1/ "https://my.blob.account/data?<my_secret_SAS_token>" folder2/
 ```
 
-#### Note
+---
+
+**NOTE**
 
 1. Surround the URL with double quotes to prevent wrong character escape in the SAS token.
 2. Visit [Create Your SAS tokens with Azure Storage Explorer](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/document-translation/create-sas-tokens?tabs=Containers) to see how to create a SAS token for this program to use.
 3. Currently, this redact CLI only supports ASCII character redaction (Latin alphabets without the accent marks).
 
+---
+
 #### PDF Support
 
 Batch mode now supports redacting data from one-page PDF documents. The tool will detect any PDF document in the input folder, convert it to an image (.png), and redact the image itself, placing it in the specified output folder upon completion.
@@ -204,7 +223,17 @@ pytest
 
 in the root folder.
 
-### Note
+---
+
+**NOTE**
 
 1. You can also take a look at the `redact/__init__.py` file. The command line interface (CLI) is just a thin wrapper on `redact_image()`, `redact_ocr_result()`, and `redact_fott_label()`. You could extend the code on top of the three functions to achieve your own goals, such as redacting a batch of data.
 2. For batch redaction, we currently only support `.jpeg`, `.jpg`, `.png`, `.tif`, `.tiff`, and `.bmp` as the file extension for images. PDF files are not supported.
+
+---
+
+## References
+
+- [Form Recognizer API v2.0](https://westus2.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-v2/operations/AnalyzeWithCustomForm)
+- [Form Recognizer API v2.1](https://westus.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-v2-1/operations/AnalyzeWithCustomForm)
+- [Form Recognizer API v3.0](https://westus.dev.cognitive.microsoft.com/docs/services/form-recognizer-api-2022-08-31/operations/GetAnalyzeDocumentResult)
@@ -8,108 +8,97 @@
 from typing import List
 from uuid import uuid4
 
-from redact import redact_image, redact_fott_label, redact_ocr_result
+from redact import redact_fott_label, redact_ocr_result, redact_file_bundle
 from redact.io.blob_reader import BlobReader
 from redact.io.blob_writer import BlobWriter
 from redact.io.local_reader import LocalReader
 from redact.io.local_writer import LocalWriter
 from redact.utils.file_name import get_redacted_file_name, valid_url
-from redact.utils.pdf_renderer import PdfRenderer
 from redact.types.file_bundle import FileType, FileBundle
-from redact.types.pre_processing_bundle import PdfPreProcessingBundle
+from redact.preprocess import preprocess_multi_page_bundle
 
 
 # Strong Assumption: assume all valid URLs are Azure Blob URL.
 def is_blob_url(url: str) -> bool:
     return valid_url(url)
 
 
-def process_pdf_bundle(file_bundles: List[FileBundle], fields_to_redact: List[str]):
-    renderer = PdfRenderer()
-
-    for file_bundle in file_bundles:
-        pdf_pre_processing_bundle = PdfPreProcessingBundle.from_file_bundle(file_bundle)
-
-        redacted_image_name = get_redacted_file_name(pdf_pre_processing_bundle.rendered_file_name)
-        redacted_fott_name = get_redacted_file_name(file_bundle.fott_file_name)
-        redacted_ocr_name = get_redacted_file_name(file_bundle.ocr_file_name)
-
-        # Render PDF
-        renderer.render_pdf_and_save(
-            Path(build_pre_processing_folder, file_bundle.image_file_name),
-            Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name),
-            target_pdf_render_dpi)
-
-        # Follow the regular redaction process with taking files from slightly different source folders
-        redact_image(
-            Path(build_pre_processing_folder, pdf_pre_processing_bundle.rendered_file_name),
-            Path(build_pre_processing_folder, file_bundle.fott_file_name),
-            Path(build_output_folder, redacted_image_name),
-            fields_to_redact)
-        redact_fott_label(
-            Path(build_pre_processing_folder, file_bundle.fott_file_name),
-            Path(build_output_folder, redacted_fott_name),
-            fields_to_redact)
-        redact_ocr_result(
-            Path(build_pre_processing_folder, file_bundle.ocr_file_name),
-            Path(build_pre_processing_folder, file_bundle.fott_file_name),
-            Path(build_output_folder, redacted_ocr_name),
-            fields_to_redact)
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     input_container = sys.argv[1]
     input_path = sys.argv[2]
     output_container = sys.argv[3]
     output_path = sys.argv[4]
+    api_version = sys.argv[5]
     target_pdf_render_dpi = 300
-    fields_to_redact = []
+    fields_to_redact = tuple()
 
-    if len(sys.argv) >= 6:
-        fields_to_redact = (sys.argv[5].split(','))
+    if len(sys.argv) >= 7:
+        fields_to_redact = sys.argv[6].split(",")
 
     # Randomly generated UUID in the build folder name to prevent name collisions.
-    build_path = Path(f'build-{uuid4()}/')
-    build_pre_processing_folder = Path(build_path, "pre/")
+    build_path = Path(f"build-{uuid4()}/")
+    build_pre_folder = Path(build_path, "pre/")
     build_input_folder = Path(build_path, "in/")
     build_output_folder = Path(build_path, "out/")
-    Path(build_pre_processing_folder).mkdir(parents=True, exist_ok=True)
+    Path(build_pre_folder).mkdir(parents=True, exist_ok=True)
     Path(build_input_folder).mkdir(parents=True, exist_ok=True)
     Path(build_output_folder).mkdir(parents=True, exist_ok=True)
+
     try:
         file_bundle_list = None
-        pdf_file_bundle_list = None
+        multi_page_bundle_list = None
         if is_blob_url(input_container):
             reader = BlobReader(input_container, input_path)
-            pdf_file_bundle_list = reader.download_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
+            multi_page_bundle_list = reader.download_bundles(
+                to=build_pre_folder, mode=FileType.MULTI_PAGE
+            )
             file_bundle_list = reader.download_bundles(to=build_input_folder)
         else:
             reader = LocalReader(input_path)
-            pdf_file_bundle_list = reader.copy_bundles(to=build_pre_processing_folder, mode=FileType.PDF_ONLY)
+            multi_page_bundle_list = reader.copy_bundles(
+                to=build_pre_folder, mode=FileType.MULTI_PAGE
+            )
             file_bundle_list = reader.copy_bundles(to=build_input_folder)
 
+        per_page_bundle_list: List[FileBundle] = []
+
+        # Render and process PDF/TIFF files if any.
+        if multi_page_bundle_list is not None:
+            for fb in multi_page_bundle_list:
+                bundle_list = preprocess_multi_page_bundle(
+                    fb, build_pre_folder, build_input_folder, target_pdf_render_dpi
+                )
+                per_page_bundle_list.extend(bundle_list)
+
+                # Short path: preprocess folder -> output folder.
+                # We still need to redact the full label file.
+                redact_fott_label(
+                    Path(build_pre_folder, fb.fott_file_name),
+                    Path(
+                        build_output_folder, get_redacted_file_name(fb.fott_file_name)
+                    ),
+                    fields_to_redact,
+                )
+
+                # We still need to redact the full ocr file.
+                redact_ocr_result(
+                    Path(build_pre_folder, fb.ocr_file_name),
+                    Path(build_pre_folder, fb.fott_file_name),
+                    Path(build_output_folder, get_redacted_file_name(fb.ocr_file_name)),
+                    api_version,
+                    fields_to_redact,
+                )
+
+        # Process images and per page result from multi-page documents.
+        file_bundle_list.extend(per_page_bundle_list)
         for fb in file_bundle_list:
-            redacted_image_name = get_redacted_file_name(fb.image_file_name)
-            redacted_fott_name = get_redacted_file_name(fb.fott_file_name)
-            redacted_ocr_name = get_redacted_file_name(fb.ocr_file_name)
-
-            redact_image(
-                Path(build_input_folder, fb.image_file_name),
-                Path(build_input_folder, fb.fott_file_name),
-                Path(build_output_folder, redacted_image_name),
-                fields_to_redact)
-            redact_fott_label(
-                Path(build_input_folder, fb.fott_file_name),
-                Path(build_output_folder, redacted_fott_name),
-                fields_to_redact)
-            redact_ocr_result(
-                Path(build_input_folder, fb.ocr_file_name),
-                Path(build_input_folder, fb.fott_file_name),
-                Path(build_output_folder, redacted_ocr_name),
-                fields_to_redact)
-
-        # Render and process PDF files if any
-        if pdf_file_bundle_list is not None:
-            process_pdf_bundle(pdf_file_bundle_list, fields_to_redact)
+            redact_file_bundle(
+                fb,
+                build_input_folder,
+                build_output_folder,
+                api_version,
+                fields_to_redact,
+            )
 
         if is_blob_url(output_container):
             writer = BlobWriter(output_container, output_path)
 
@@ -6,30 +6,35 @@
 from redact import redact_image, redact_fott_label, redact_ocr_result
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     operator = sys.argv[1]
 
-    if operator == 'image':
-        labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
+    if operator == "image":
+        labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(",")
         redact_image(
             image_path=sys.argv[2],
             fott_label_path=sys.argv[3],
             output_path=sys.argv[4],
-            labels_to_redact=labels_to_redact)
+            labels_to_redact=labels_to_redact,
+        )
 
-    elif operator == 'fott':
-        labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(',')
-        redact_fott_label(fott_label_path=sys.argv[2],
-                          output_path=sys.argv[3],
-                          labels_to_redact=labels_to_redact)
+    elif operator == "fott":
+        labels_to_redact = [] if len(sys.argv) < 5 else sys.argv[4].split(",")
+        redact_fott_label(
+            fott_label_path=sys.argv[2],
+            output_path=sys.argv[3],
+            labels_to_redact=labels_to_redact,
+        )
 
-    elif operator == 'ocr':
-        labels_to_redact = [] if len(sys.argv) < 6 else sys.argv[5].split(',')
+    elif operator == "ocr":
+        labels_to_redact = [] if len(sys.argv) < 7 else sys.argv[6].split(",")
         redact_ocr_result(
             ocr_result_path=sys.argv[2],
             fott_label_path=sys.argv[3],
             output_path=sys.argv[4],
-            labels_to_redact=labels_to_redact)
+            api_version=sys.argv[5],
+            labels_to_redact=labels_to_redact,
+        )
 
     else:
         raise NameError()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[flake8]`
	`2`	`+max-line-length = 88`
	`3`	`+extend-ignore = E203, E501, PIE798`