ArtifexSoftware · emolloy123 · Nov 22, 2024 · Nov 22, 2024
diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py
@@ -4,13 +4,15 @@
 import os
 from multiprocessing import Pool, cpu_count
 from time import perf_counter
-from typing import AnyStr, IO, Union
+from typing import AnyStr, IO, Union, List
 
 import fitz
 from docx import Document
 
 from .page.Page import Page
 from .page.Pages import Pages
+import csv
+
 
 # check PyMuPDF version
 # 1.19.0 <= v <= 1.23.8, or v>=1.23.16
@@ -357,13 +359,15 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0,
         logging.info('Terminated in %.2fs.', perf_counter()-t0)        
 
 
-    def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
+    def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, csv_folder: str= '.', **kwargs):
         '''Extract table contents from specified PDF pages.
 
         Args:
             start (int, optional): First page to process. Defaults to 0, the first page.
             end (int, optional): Last page to process. Defaults to None, the last page.
             pages (list, optional): Range of page indexes. Defaults to None.
+            write_to_csv (bool, optional): Whether to write each table to its own csv file.
+            csv_folder (str, optional): specify folder name to write csvs to
             kwargs (dict, optional): Configuration parameters. Defaults to None.
 
         Returns:
@@ -379,9 +383,37 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
         for page in self._pages:
             if page.finalized: tables.extend(page.extract_tables(**settings))
 
+        if write_to_csv:
+            self.table_to_csv(tables, csv_folder)
+
         return tables
 
-
+    def table_to_csv(self, tables:List, csv_folder: str= '.', **kwargs):
+
+        '''Write each table to a csv file as generated from extract_tables function
+
+        Args:
+            tables (list): Tables as returned from extract_tables function
+            csv_folder (str, optional): specify folder name to write csvs to
+        '''
+
+        os.makedirs(csv_folder, exist_ok=True) 
+
+        written_files = []
+        for i, table in enumerate(tables):
+
+            output_file = os.path.join(csv_folder, f'table_{i+1}.csv')
+
+            # Open a CSV file for writing
+            try:
+                with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerows(table)  # Write rows of the table
+                written_files.append(output_file)
+                logging.info(f"Table {i+1} written to {output_file}")
+            except Exception as e:
+                logging.error(f"Failed to write table {i+1} to {output_file}: {e}")
+
     def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs):
         '''Parse and create pages based on page indexes with multi-processing.