@@ -51,6 +51,19 @@ def s3_object_exists(bucket_name: str, key: str) -> bool:
51
51
raise e
52
52
53
53
54
def get_exif_tag_id_by_name(name: str) -> Optional[int]:
    """Find a numeric EXIF tag ID by common name

    As per https://pillow.readthedocs.io/en/stable/reference/ExifTags.html

    Parameters
    ----------
    name : str
        Common EXIF tag name to look up (for example "Orientation"). Compared for exact
        equality against the values of `ExifTags.TAGS`.

    Returns
    -------
    Optional[int]
        The first key of `ExifTags.TAGS` whose value equals `name`, or None if no tag has
        that name.
    """
    # next() with a default replaces the try/except StopIteration dance, and iterating
    # .items() avoids re-indexing the mapping for every key.
    return next(
        (tag_id for tag_id, tag_name in ExifTags.TAGS.items() if tag_name == name),
        None,
    )
65
+
66
+
54
67
class ImageExtractionResult :
55
68
"""Result descriptor for extracting a source image/document to image(s)"""
56
69
@@ -66,7 +79,7 @@ def clean_dataset_for_img_ocr(
66
79
filepaths : Optional [Iterable [str ]] = None ,
67
80
pdf_dpi : int = 300 ,
68
81
pdf_image_format : str = "png" ,
69
- textract_compatible_formats : Iterable [str ] = ("jpg" , "jpeg" , "png" ),
82
+ allowed_formats : Iterable [str ] = ("jpg" , "jpeg" , "png" ),
70
83
preferred_image_format : str = "png" ,
71
84
) -> List [ImageExtractionResult ]:
72
85
"""Process a mixed PDF/image dataset for use with SageMaker Ground Truth image task UIs
@@ -86,9 +99,12 @@ def clean_dataset_for_img_ocr(
86
99
DPI resolution to extract images from PDFs (Default 300).
87
100
pdf_image_format : str
88
101
Format to extract images from PDFs (Default 'png').
89
- textract_compatible_formats : Iterable[str]
90
- The set of compatible file formats for Textract : Used to determine whether to convert
102
+ allowed_formats : Iterable[str]
103
+ The set of permitted file formats for compatibility : Used to determine whether to convert
91
104
source images in other formats which PIL may still have been able to successfully load.
105
+ NOTE: Amazon Textract also supports 'tiff', but we left it out of the default list because
106
+ TIFF images seemed to break the SageMaker Ground Truth built-in bounding box UI as of some
107
+ tests in 2021-10. Default ('jpg', 'jpeg', 'png').
92
108
preferred_image_format : str
93
109
Format to be used when an image has been saved/converted (Default 'png').
94
110
"""
@@ -104,7 +120,7 @@ def clean_dataset_for_img_ocr(
104
120
(os .path .join (path , f ) for path , _ , files in os .walk (from_path ) for f in files ),
105
121
)
106
122
)
107
- n_files_total = len ( filepaths )
123
+ ORIENTATION_EXIF_ID = get_exif_tag_id_by_name ( "Orientation" )
108
124
os .makedirs (to_path , exist_ok = True )
109
125
110
126
for filepath in tqdm (filepaths , desc = "Processing input files..." , unit = "file" ):
@@ -144,20 +160,27 @@ def clean_dataset_for_img_ocr(
144
160
"\n - " .join (result .cleanpaths ),
145
161
)
146
162
)
147
- else :
148
- try :
149
- image = PIL .Image .open (filepath )
150
- except PIL .UnidentifiedImageError :
151
- logger .warning (f"* Ignoring incompatible file: { filepath } " )
152
- continue
163
+ continue # PDF processed successfully
164
+
165
+ try :
166
+ image = PIL .Image .open (filepath )
167
+ except PIL .UnidentifiedImageError :
168
+ logger .warning (f"* Ignoring incompatible file: { filepath } " )
169
+ continue # Skip file
170
+
171
+ # Some "image" formats (notably TIFF) support multiple pages as "frames":
172
+ n_image_pages = getattr (image , "n_frames" , 1 )
173
+ if n_image_pages > 1 :
174
+ logger .info ("Extracting %s pages from file %s" , n_image_pages , filepath )
175
+ convert_format = not ext_lower in allowed_formats
176
+ outpaths = []
177
+ for ixpage in range (n_image_pages ):
178
+ if n_image_pages > 1 :
179
+ image .seek (ixpage )
153
180
154
181
# Correct orientation from EXIF data:
155
- for orientation in ExifTags .TAGS .keys ():
156
- if ExifTags .TAGS [orientation ] == "Orientation" :
157
- break
158
- exif = dict ((image ._getexif () or {}).items ())
159
- img_orientation = exif .get (orientation )
160
- logger .info ("Image {} has orientation {}" .format (filepath , img_orientation ))
182
+ exif = dict ((image .getexif () or {}).items ())
183
+ img_orientation = exif .get (ORIENTATION_EXIF_ID )
161
184
if img_orientation == 3 :
162
185
image = image .rotate (180 , expand = True )
163
186
rotated = True
@@ -170,21 +193,38 @@ def clean_dataset_for_img_ocr(
170
193
else :
171
194
rotated = False
172
195
173
- if ext_lower not in textract_compatible_formats :
174
- outpath = os .path .join (outfolder , f"{ basename } .{ preferred_image_format } " )
175
- image .save (outpath )
176
- logger .info (f"* Converted image { filepath } to { outpath } " )
177
- elif rotated :
196
+ if n_image_pages == 1 and not (convert_format or rotated ):
197
+ # Special case where image file can just be copied across:
178
198
outpath = os .path .join (outfolder , filename )
179
- image .save (outpath )
180
- logger .info (f"* Rotated image { filepath } to { outpath } " )
199
+ shutil .copy2 (filepath , outpath )
181
200
else :
182
- outpath = os .path .join (outfolder , filename )
201
+ outpath = os .path .join (
202
+ outfolder ,
203
+ "" .join ((
204
+ basename ,
205
+ "-%04i" % (ixpage + 1 ) if n_image_pages > 1 else "" ,
206
+ "." ,
207
+ preferred_image_format if convert_format else ext ,
208
+ )),
209
+ )
210
+ image .save (outpath )
183
211
184
- shutil .copy2 (filepath , outpath )
185
- logger .info (f"* Copied file { filepath } to { outpath } " )
186
- result .cleanpaths = [outpath ]
187
- results .append (result )
212
+ outpaths .append (outpath )
213
+ logger .info (
214
+ "* %s image %s%s (orientation %s) to %s" ,
215
+ "Converted" if convert_format else (
216
+ "Rotated" if rotated else (
217
+ "Extracted" if n_image_pages > 1 else "Copied"
218
+ )
219
+ ),
220
+ filepath ,
221
+ f" page { ixpage + 1 } " if n_image_pages > 1 else "" ,
222
+ img_orientation ,
223
+ outpath ,
224
+ )
225
+
226
+ result .cleanpaths = outpaths
227
+ results .append (result )
188
228
189
229
logger .info ("Done!" )
190
230
return results
0 commit comments