From 4a34b9d3a41ecf7325e5e17f673e49ede1d8ba69 Mon Sep 17 00:00:00 2001
From: Eoin Molloy <eoinmolloy@Eoins-MacBook-Air.local>
Date: Fri, 22 Nov 2024 21:17:32 +0000
Subject: [PATCH 1/2] Add option to write tables returned by extract_tables
 to CSV files

---
 pdf2docx/converter.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py
index 337d9b3b..3b286807 100644
--- a/pdf2docx/converter.py
+++ b/pdf2docx/converter.py
@@ -4,13 +4,15 @@
 import os
 from multiprocessing import Pool, cpu_count
 from time import perf_counter
-from typing import AnyStr, IO, Union
+from typing import AnyStr, IO, Union, List
 
 import fitz
 from docx import Document
 
 from .page.Page import Page
 from .page.Pages import Pages
+import csv
+
 
 # check PyMuPDF version
 # 1.19.0 <= v <= 1.23.8, or v>=1.23.16
@@ -357,13 +359,14 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0,
         logging.info('Terminated in %.2fs.', perf_counter()-t0)        
 
 
-    def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
+    def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, **kwargs):
         '''Extract table contents from specified PDF pages.
 
         Args:
             start (int, optional): First page to process. Defaults to 0, the first page.
             end (int, optional): Last page to process. Defaults to None, the last page.
             pages (list, optional): Range of page indexes. Defaults to None.
+            write_to_csv (bool, optional): Whether to write each table to its own csv file.
             kwargs (dict, optional): Configuration parameters. Defaults to None.
         
         Returns:
@@ -379,9 +382,29 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
         for page in self._pages:
             if page.finalized: tables.extend(page.extract_tables(**settings))
 
+        if write_to_csv:
+            self.table_to_csv(tables)
+            
         return tables
 
-    
+    def table_to_csv(self, tables:List, **kwargs):
+
+        '''Write each table to a csv file as generated from extract_tables function
+        
+        Args:
+            tables (list): Tables as returned from extract_tables function
+            csv_folder (str, optional): folder to write
+        '''
+
+        for i, table in enumerate(tables):
+            output_file = f'table_{i+1}.csv'
+            
+            # Open a CSV file for writing
+            with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
+                writer = csv.writer(csvfile)
+                # Write each row of the table to the CSV file
+                writer.writerows(table)
+                
     def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs):
         '''Parse and create pages based on page indexes with multi-processing.
 

From 25b5ffe1955666679336217bb913458dc6565aaa Mon Sep 17 00:00:00 2001
From: Eoin Molloy <eoinmolloy@Eoins-MacBook-Air.local>
Date: Fri, 22 Nov 2024 21:32:26 +0000
Subject: [PATCH 2/2] Add csv_folder option for CSV output and log which
 tables are written or fail to be written

---
 pdf2docx/converter.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py
index 3b286807..a923b179 100644
--- a/pdf2docx/converter.py
+++ b/pdf2docx/converter.py
@@ -359,7 +359,7 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0,
         logging.info('Terminated in %.2fs.', perf_counter()-t0)        
 
 
-    def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, **kwargs):
+    def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, csv_folder: str= '.', **kwargs):
         '''Extract table contents from specified PDF pages.
 
         Args:
@@ -367,6 +367,7 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_cs
             end (int, optional): Last page to process. Defaults to None, the last page.
             pages (list, optional): Range of page indexes. Defaults to None.
             write_to_csv (bool, optional): Whether to write each table to its own csv file.
+            csv_folder (str, optional): specify folder name to write csvs to
             kwargs (dict, optional): Configuration parameters. Defaults to None.
         
         Returns:
@@ -383,27 +384,35 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_cs
             if page.finalized: tables.extend(page.extract_tables(**settings))
 
         if write_to_csv:
-            self.table_to_csv(tables)
+            self.table_to_csv(tables, csv_folder)
             
         return tables
 
-    def table_to_csv(self, tables:List, **kwargs):
+    def table_to_csv(self, tables:List, csv_folder: str= '.', **kwargs):
 
         '''Write each table to a csv file as generated from extract_tables function
         
         Args:
             tables (list): Tables as returned from extract_tables function
-            csv_folder (str, optional): folder to write
+            csv_folder (str, optional): specify folder name to write csvs to
         '''
 
+        os.makedirs(csv_folder, exist_ok=True) 
+
+        written_files = []
         for i, table in enumerate(tables):
-            output_file = f'table_{i+1}.csv'
+            
+            output_file = os.path.join(csv_folder, f'table_{i+1}.csv')
             
             # Open a CSV file for writing
-            with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
-                writer = csv.writer(csvfile)
-                # Write each row of the table to the CSV file
-                writer.writerows(table)
+            try:
+                with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerows(table)  # Write rows of the table
+                written_files.append(output_file)
+                logging.info(f"Table {i+1} written to {output_file}")
+            except Exception as e:
+                logging.error(f"Failed to write table {i+1} to {output_file}: {e}")
                 
     def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs):
         '''Parse and create pages based on page indexes with multi-processing.