Skip to content

Commit 22f954e

Browse files
committed
feat(nbs): multi-frame image pre-processing
Add support for multi-page image-like files (e.g. TIFF) to pre-processing notebook's clean_dataset_for_img_ocr() function. For now, we still convert TIFF pages to PNGs by default because the SageMaker Ground Truth Bounding Box UI seems to be struggling with them in the sample docs we tested.
1 parent b6337d0 commit 22f954e

File tree

2 files changed

+69
-29
lines changed

2 files changed

+69
-29
lines changed

notebooks/1. Data Preparation.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -674,7 +674,7 @@
674674
" # Default params:\n",
675675
" # pdf_dpi=300,\n",
676676
" # pdf_image_format=\"png\",\n",
677-
" # textract_compatible_formats=(\"jpg\", \"jpeg\", \"png\"),\n",
677+
" # allowed_formats=(\"jpg\", \"jpeg\", \"png\"),\n",
678678
" # preferred_image_format=\"png\",\n",
679679
")"
680680
]

notebooks/util/preproc.py

Lines changed: 68 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,19 @@ def s3_object_exists(bucket_name: str, key: str) -> bool:
5151
raise e
5252

5353

54+
def get_exif_tag_id_by_name(name: str) -> Optional[int]:
    """Find a numeric EXIF tag ID by common name

    As per https://pillow.readthedocs.io/en/stable/reference/ExifTags.html

    Parameters
    ----------
    name : str
        Common EXIF tag name to look up among the values of PIL's `ExifTags.TAGS`
        (e.g. "Orientation"). The comparison is an exact, case-sensitive match.

    Returns
    -------
    Optional[int]
        The first tag ID in `ExifTags.TAGS` whose name equals `name`, or None if no tag matches.
    """
    # ExifTags.TAGS is keyed by tag ID with the name as value, so finding an ID from a name
    # needs a scan over the items. The `None` default to next() replaces catching StopIteration.
    return next(
        (tag_id for tag_id, tag_name in ExifTags.TAGS.items() if tag_name == name),
        None,
    )
65+
66+
5467
class ImageExtractionResult:
5568
"""Result descriptor for extracting a source image/document to image(s)"""
5669

@@ -66,7 +79,7 @@ def clean_dataset_for_img_ocr(
6679
filepaths: Optional[Iterable[str]] = None,
6780
pdf_dpi: int = 300,
6881
pdf_image_format: str = "png",
69-
textract_compatible_formats: Iterable[str] = ("jpg", "jpeg", "png"),
82+
allowed_formats: Iterable[str] = ("jpg", "jpeg", "png"),
7083
preferred_image_format: str = "png",
7184
) -> List[ImageExtractionResult]:
7285
"""Process a mixed PDF/image dataset for use with SageMaker Ground Truth image task UIs
@@ -86,9 +99,12 @@ def clean_dataset_for_img_ocr(
8699
DPI resolution to extract images from PDFs (Default 300).
87100
pdf_image_format : str
88101
Format to extract images from PDFs (Default 'png').
89-
textract_compatible_formats : Iterable[str]
90-
The set of compatible file formats for Textract: Used to determine whether to convert
102+
allowed_formats : Iterable[str]
103+
The set of permitted file formats for compatibility: Used to determine whether to convert
91104
source images in other formats which PIL may still have been able to successfully load.
105+
NOTE: Amazon Textract also supports 'tiff', but we left it out of the default list because
106+
TIFF images seemed to break the SageMaker Ground Truth built-in bounding box UI as of some
107+
tests in 2021-10. Default ('jpg', 'jpeg', 'png').
92108
preferred_image_format : str
93109
Format to be used when an image has been saved/converted (Default 'png').
94110
"""
@@ -104,7 +120,7 @@ def clean_dataset_for_img_ocr(
104120
(os.path.join(path, f) for path, _, files in os.walk(from_path) for f in files),
105121
)
106122
)
107-
n_files_total = len(filepaths)
123+
ORIENTATION_EXIF_ID = get_exif_tag_id_by_name("Orientation")
108124
os.makedirs(to_path, exist_ok=True)
109125

110126
for filepath in tqdm(filepaths, desc="Processing input files...", unit="file"):
@@ -144,20 +160,27 @@ def clean_dataset_for_img_ocr(
144160
"\n - ".join(result.cleanpaths),
145161
)
146162
)
147-
else:
148-
try:
149-
image = PIL.Image.open(filepath)
150-
except PIL.UnidentifiedImageError:
151-
logger.warning(f"* Ignoring incompatible file: {filepath}")
152-
continue
163+
continue # PDF processed successfully
164+
165+
try:
166+
image = PIL.Image.open(filepath)
167+
except PIL.UnidentifiedImageError:
168+
logger.warning(f"* Ignoring incompatible file: {filepath}")
169+
continue # Skip file
170+
171+
# Some "image" formats (notably TIFF) support multiple pages as "frames":
172+
n_image_pages = getattr(image, "n_frames", 1)
173+
if n_image_pages > 1:
174+
logger.info("Extracting %s pages from file %s", n_image_pages, filepath)
175+
convert_format = not ext_lower in allowed_formats
176+
outpaths = []
177+
for ixpage in range(n_image_pages):
178+
if n_image_pages > 1:
179+
image.seek(ixpage)
153180

154181
# Correct orientation from EXIF data:
155-
for orientation in ExifTags.TAGS.keys():
156-
if ExifTags.TAGS[orientation] == "Orientation":
157-
break
158-
exif = dict((image._getexif() or {}).items())
159-
img_orientation = exif.get(orientation)
160-
logger.info("Image {} has orientation {}".format(filepath, img_orientation))
182+
exif = dict((image.getexif() or {}).items())
183+
img_orientation = exif.get(ORIENTATION_EXIF_ID)
161184
if img_orientation == 3:
162185
image = image.rotate(180, expand=True)
163186
rotated = True
@@ -170,21 +193,38 @@ def clean_dataset_for_img_ocr(
170193
else:
171194
rotated = False
172195

173-
if ext_lower not in textract_compatible_formats:
174-
outpath = os.path.join(outfolder, f"{basename}.{preferred_image_format}")
175-
image.save(outpath)
176-
logger.info(f"* Converted image {filepath} to {outpath}")
177-
elif rotated:
196+
if n_image_pages == 1 and not (convert_format or rotated):
197+
# Special case where image file can just be copied across:
178198
outpath = os.path.join(outfolder, filename)
179-
image.save(outpath)
180-
logger.info(f"* Rotated image {filepath} to {outpath}")
199+
shutil.copy2(filepath, outpath)
181200
else:
182-
outpath = os.path.join(outfolder, filename)
201+
outpath = os.path.join(
202+
outfolder,
203+
"".join((
204+
basename,
205+
"-%04i" % (ixpage + 1) if n_image_pages > 1 else "",
206+
".",
207+
preferred_image_format if convert_format else ext,
208+
)),
209+
)
210+
image.save(outpath)
183211

184-
shutil.copy2(filepath, outpath)
185-
logger.info(f"* Copied file {filepath} to {outpath}")
186-
result.cleanpaths = [outpath]
187-
results.append(result)
212+
outpaths.append(outpath)
213+
logger.info(
214+
"* %s image %s%s (orientation %s) to %s",
215+
"Converted" if convert_format else (
216+
"Rotated" if rotated else (
217+
"Extracted" if n_image_pages > 1 else "Copied"
218+
)
219+
),
220+
filepath,
221+
f" page {ixpage + 1}" if n_image_pages > 1 else "",
222+
img_orientation,
223+
outpath,
224+
)
225+
226+
result.cleanpaths = outpaths
227+
results.append(result)
188228

189229
logger.info("Done!")
190230
return results

0 commit comments

Comments
 (0)