Improve pypdfium2 integration: render pages directly to NumPy arrays via to_numpy() instead of converting through PIL and cv2

mara004 · mara004 · commit f3203cf1f63b · 2025-09-26T14:14:03.000+02:00
diff --git a/paddlex/inference/models/formula_recognition/result.py b/paddlex/inference/models/formula_recognition/result.py
@@ -283,10 +283,7 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
         for page in pdfDoc:
             rotate = int(0)
             zoom = 2
-            img = page.render(scale=zoom, rotation=rotate).to_pil()
-            img = img.convert("RGB")
-            img = np.array(img)
-            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+            img = page.render(scale=zoom, rotation=rotate).to_numpy()
             xywh = crop_white_area(img)
 
             if xywh is not None:
diff --git a/paddlex/inference/serving/infra/utils.py b/paddlex/inference/serving/infra/utils.py
@@ -195,10 +195,7 @@ def read_pdf(
                 # TODO: Do not always use zoom=2.0
                 zoom = 2.0
                 deg = 0
-                image = page.render(scale=zoom, rotation=deg).to_pil()
-                image = image.convert("RGB")
-                image = np.array(image)
-                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+                image = page.render(scale=zoom, rotation=deg).to_numpy()
                 images.append(image)
                 page_info = PDFPageInfo(
                     width=image.shape[1],
diff --git a/paddlex/inference/utils/io/readers.py b/paddlex/inference/utils/io/readers.py
@@ -293,11 +293,7 @@ def read_file(self, in_path):
         doc = pdfium.PdfDocument(in_path)
         try:
             for page in doc:
-                image = page.render(scale=self._scale, rotation=self._rotation).to_pil()
-                image = image.convert("RGB")
-                img_cv = np.array(image)
-                img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
-                yield img_cv
+                yield page.render(scale=self._scale, rotation=self._rotation).to_numpy()
         finally:
             doc.close()