From 4a34b9d3a41ecf7325e5e17f673e49ede1d8ba69 Mon Sep 17 00:00:00 2001 From: Eoin Molloy Date: Fri, 22 Nov 2024 21:17:32 +0000 Subject: [PATCH 1/2] Add functionality to write tables produced in extract_tables to be written to CSV files --- pdf2docx/converter.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py index 337d9b3b..3b286807 100644 --- a/pdf2docx/converter.py +++ b/pdf2docx/converter.py @@ -4,13 +4,15 @@ import os from multiprocessing import Pool, cpu_count from time import perf_counter -from typing import AnyStr, IO, Union +from typing import AnyStr, IO, Union, List import fitz from docx import Document from .page.Page import Page from .page.Pages import Pages +import csv + # check PyMuPDF version # 1.19.0 <= v <= 1.23.8, or v>=1.23.16 @@ -357,13 +359,14 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0, logging.info('Terminated in %.2fs.', perf_counter()-t0) - def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs): + def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, **kwargs): '''Extract table contents from specified PDF pages. Args: start (int, optional): First page to process. Defaults to 0, the first page. end (int, optional): Last page to process. Defaults to None, the last page. pages (list, optional): Range of page indexes. Defaults to None. + write_to_csv (bool, optional): Whether to write each table to its own csv file. kwargs (dict, optional): Configuration parameters. Defaults to None. Returns: @@ -379,9 +382,29 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs): for page in self._pages: if page.finalized: tables.extend(page.extract_tables(**settings)) + if write_to_csv: + self.table_to_csv(tables) + return tables - + def table_to_csv(self, tables:List, **kwargs): + + '''Write each table to a csv file as generated from extract_tables function + + Args: + tables (list): Tables as returned from extract_tables function + csv_folder (str, optional): folder to write + ''' + + for i, table in enumerate(tables): + output_file = f'table_{i+1}.csv' + + # Open a CSV file for writing + with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + # Write each row of the table to the CSV file + writer.writerows(table) + def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs): '''Parse and create pages based on page indexes with multi-processing. From 25b5ffe1955666679336217bb913458dc6565aaa Mon Sep 17 00:00:00 2001 From: Eoin Molloy Date: Fri, 22 Nov 2024 21:32:26 +0000 Subject: [PATCH 2/2] Add ability to specify folder name of csv files and logging to identify which table is not being converted to csv correctly --- pdf2docx/converter.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py index 3b286807..a923b179 100644 --- a/pdf2docx/converter.py +++ b/pdf2docx/converter.py @@ -359,7 +359,7 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0, logging.info('Terminated in %.2fs.', perf_counter()-t0) - def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, **kwargs): + def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, csv_folder: str= '.', **kwargs): '''Extract table contents from specified PDF pages. Args: @@ -367,6 +367,7 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_cs end (int, optional): Last page to process. Defaults to None, the last page. pages (list, optional): Range of page indexes. Defaults to None. write_to_csv (bool, optional): Whether to write each table to its own csv file. + csv_folder (str, optional): specify folder name to write csvs to kwargs (dict, optional): Configuration parameters. Defaults to None. Returns: @@ -383,27 +384,35 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_cs if page.finalized: tables.extend(page.extract_tables(**settings)) if write_to_csv: - self.table_to_csv(tables) + self.table_to_csv(tables, csv_folder) return tables - def table_to_csv(self, tables:List, **kwargs): + def table_to_csv(self, tables:List, csv_folder: str= '.', **kwargs): '''Write each table to a csv file as generated from extract_tables function Args: tables (list): Tables as returned from extract_tables function - csv_folder (str, optional): folder to write + csv_folder (str, optional): specify folder name to write csvs to ''' + os.makedirs(csv_folder, exist_ok=True) + + written_files = [] for i, table in enumerate(tables): - output_file = f'table_{i+1}.csv' + + output_file = os.path.join(csv_folder, f'table_{i+1}.csv') # Open a CSV file for writing - with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write each row of the table to the CSV file - writer.writerows(table) + try: + with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(table) # Write rows of the table + written_files.append(output_file) + logging.info(f"Table {i+1} written to {output_file}") + except Exception as e: + logging.error(f"Failed to write table {i+1} to {output_file}: {e}") def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs): '''Parse and create pages based on page indexes with multi-processing.