22import logging
33from ctypes import c_char_p , c_ushort
44from threading import RLock
5- from typing import BinaryIO , List , Optional , Union
5+ from typing import BinaryIO , List , Optional , Tuple , Union
66
77import pypdfium2 as pdfium
88import pypdfium2 .raw as pdfium_c
99from _ctypes import POINTER
10+ from PIL import Image
1011
1112from mindee .image_operations .image_compressor import compress_image
1213from mindee .pdf .pdf_char_data import PDFCharData
1314from mindee .pdf .pdf_utils import (
14- attach_images_as_new_file ,
1515 extract_text_from_pdf ,
1616 has_source_text ,
1717)
@@ -61,19 +61,22 @@ def compress_pdf(
6161 extract_text_from_pdf (pdf_bytes ) if not disable_source_text else None
6262 )
6363
64- compressed_pages = compress_pdf_pages (
65- pdf_bytes , extracted_text , image_quality , disable_source_text
66- )
64+ compressed_pages = compress_pdf_pages (pdf_bytes , image_quality )
6765
6866 if not compressed_pages :
6967 logger .warning (
7068 "Could not compress PDF to a smaller size. Returning original PDF."
7169 )
7270 return pdf_bytes
7371
74- out_pdf = attach_images_as_new_file (
75- [io . BytesIO ( compressed_page ) for compressed_page in compressed_pages ]
72+ out_pdf = collect_images_as_pdf (
73+ [compressed_page_image [ 0 ] for compressed_page_image in compressed_pages ]
7674 )
75+
76+ if not disable_source_text :
77+ for i , page in enumerate (out_pdf ):
78+ add_text_to_pdf_page (page , i , extracted_text )
79+
7780 out_buffer = io .BytesIO ()
7881 out_pdf .save (out_buffer )
7982 out_buffer .seek (0 )
@@ -82,26 +85,20 @@ def compress_pdf(
8285
8386def compress_pdf_pages (
8487 pdf_data : bytes ,
85- extracted_text : Optional [List [PDFCharData ]],
8688 image_quality : int ,
87- disable_source_text : bool ,
88- ) -> Optional [List [bytes ]]:
89+ ) -> Optional [List [Tuple [bytes , int , int ]]]:
8990 """
9091 Compresses PDF pages and returns an array of compressed page buffers.
9192
9293 :param pdf_data: The input PDF as bytes.
93- :param extracted_text: Extracted text from the PDF.
9494 :param image_quality: Initial compression quality.
95- :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
9695 :return: List of compressed page buffers, or None if compression fails.
9796 """
9897 original_size = len (pdf_data )
9998 image_quality_loop = image_quality
10099
101100 while image_quality_loop >= MIN_QUALITY :
102- compressed_pages = compress_pages_with_quality (
103- pdf_data , extracted_text , image_quality_loop , disable_source_text
104- )
101+ compressed_pages = compress_pages_with_quality (pdf_data , image_quality_loop )
105102 total_compressed_size = sum (len (page ) for page in compressed_pages )
106103
107104 if is_compression_successful (
@@ -115,28 +112,28 @@ def compress_pdf_pages(
115112
116113
117114def add_text_to_pdf_page ( # type: ignore
118- document : pdfium .PdfDocument ,
115+ page : pdfium .PdfPage ,
119116 page_id : int ,
120- extracted_text : Optional [List [PDFCharData ]],
117+ extracted_text : Optional [List [List [ PDFCharData ] ]],
121118) -> None :
122119 """
123120 Adds text to a PDF page based on the extracted text data.
124121
125- :param document : The PDFDocument object.
126- :param page_id: ID of the current page.
122+ :param page: The PdfPage object the text objects are inserted into.
123+ :param page_id: The ID of the page.
127124 :param extracted_text: List of PDFCharData objects containing text and positioning information.
128125 """
129- if not extracted_text :
126+ if not extracted_text or not extracted_text [ page_id ] :
130127 return
131128
132- height = document [ page_id ] .get_height ()
129+ height = page .get_height ()
133130 pdfium_lock = RLock ()
134131
135132 with pdfium_lock :
136- for char_data in extracted_text :
133+ for char_data in extracted_text [ page_id ] :
137134 font_name = c_char_p (char_data .font_name .encode ("utf-8" ))
138135 text_handler = pdfium_c .FPDFPageObj_NewTextObj (
139- document .raw , font_name , char_data .font_size
136+ page . pdf .raw , font_name , char_data .font_size
140137 )
141138 char_code = ord (char_data .char )
142139 char_code_c_char = c_ushort (char_code )
@@ -145,38 +142,28 @@ def add_text_to_pdf_page( # type: ignore
145142 pdfium_c .FPDFPageObj_Transform (
146143 text_handler , 1 , 0 , 0 , 1 , char_data .left , height - char_data .top
147144 )
148- pdfium_c .FPDFPage_InsertObject (document [page_id ].raw , text_handler )
149- pdfium_c .FPDFPageObj_Destroy (text_handler )
150- pdfium_c .FPDFPage_GenerateContent (document [page_id ].raw )
151- pdfium_c .FPDF_ClosePage (document [page_id ].raw )
145+ pdfium_c .FPDFPage_InsertObject (page .raw , text_handler )
146+ pdfium_c .FPDFPage_GenerateContent (page .raw )
152147
153148
def compress_pages_with_quality(
    pdf_data: bytes,
    image_quality: int,
) -> List[Tuple[bytes, int, int]]:
    """
    Compresses pages with a specific quality.

    Each page of the input PDF is rasterized and compressed, then the
    compressed image is opened with PIL to read its dimensions.

    :param pdf_data: The input PDF as bytes.
    :param image_quality: Compression quality.
    :return: One ``(compressed image bytes, width, height)`` tuple per page,
        in page order.
    """
    pdf_document = pdfium.PdfDocument(pdf_data)
    compressed_pages = []
    for page in pdf_document:
        rasterized_page = rasterize_page(page, image_quality)
        compressed_image = compress_image(rasterized_page, image_quality)
        # The image is opened only to read its size; the context manager
        # releases the PIL handle right away instead of leaving it to the GC.
        with Image.open(io.BytesIO(compressed_image)) as image:
            width, height = image.size
        compressed_pages.append((compressed_image, width, height))

    return compressed_pages
182169
@@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float:
223210 :return: The interpolated value.
224211 """
225212 return start * (1 - t ) + end * t
213+
214+
def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
    """
    Converts a list of JPEG images into pages in a PdfDocument.

    One page is created per image, sized to the image's dimensions, and the
    image is scaled so that it covers the whole page.

    :param image_list: A list of bytes representing JPEG images.
    :return: A PdfDocument handle containing the images as pages.
    """
    out_pdf = pdfium.PdfDocument.new()

    for image_bytes in image_list:
        # Load the JPEG image into a PdfImage object owned by the new document.
        pdf_image = pdfium.PdfImage.new(out_pdf)
        pdf_image.load_jpeg(io.BytesIO(image_bytes))

        width, height = pdf_image.get_size()

        # A freshly created image object occupies a 1x1 unit square; without
        # this scale matrix it would not fill the page created below.
        pdf_image.set_matrix(pdfium.PdfMatrix().scale(width, height))

        # Create a page with the same dimensions as the image and place it.
        page = out_pdf.new_page(width, height)
        page.insert_obj(pdf_image)

        # Generate content for the page to finalize it.
        page.gen_content()

    return out_pdf
0 commit comments