Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions pdf2docx/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import os
from multiprocessing import Pool, cpu_count
from time import perf_counter
from typing import AnyStr, IO, Union
from typing import AnyStr, IO, Union, List

import fitz
from docx import Document

from .page.Page import Page
from .page.Pages import Pages
import csv


# check PyMuPDF version
# 1.19.0 <= v <= 1.23.8, or v>=1.23.16
Expand Down Expand Up @@ -357,13 +359,15 @@ def convert(self, docx_filename: Union[str, IO[AnyStr]] = None, start: int = 0,
logging.info('Terminated in %.2fs.', perf_counter()-t0)


def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
def extract_tables(self, start:int=0, end:int=None, pages:list=None, write_to_csv = False, csv_folder: str= '.', **kwargs):
'''Extract table contents from specified PDF pages.

Args:
start (int, optional): First page to process. Defaults to 0, the first page.
end (int, optional): Last page to process. Defaults to None, the last page.
pages (list, optional): Range of page indexes. Defaults to None.
write_to_csv (bool, optional): Whether to write each table to its own csv file.
csv_folder (str, optional): specify folder name to write csvs to
kwargs (dict, optional): Configuration parameters. Defaults to None.

Returns:
Expand All @@ -379,9 +383,37 @@ def extract_tables(self, start:int=0, end:int=None, pages:list=None, **kwargs):
for page in self._pages:
if page.finalized: tables.extend(page.extract_tables(**settings))

if write_to_csv:
self.table_to_csv(tables, csv_folder)

return tables


def table_to_csv(self, tables:List, csv_folder: str= '.', **kwargs):

'''Write each table to a csv file as generated from extract_tables function

Args:
tables (list): Tables as returned from extract_tables function
csv_folder (str, optional): specify folder name to write csvs to
'''

os.makedirs(csv_folder, exist_ok=True)

written_files = []
for i, table in enumerate(tables):

output_file = os.path.join(csv_folder, f'table_{i+1}.csv')

# Open a CSV file for writing
try:
with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(table) # Write rows of the table
written_files.append(output_file)
logging.info(f"Table {i+1} written to {output_file}")
except Exception as e:
logging.error(f"Failed to write table {i+1} to {output_file}: {e}")

def _convert_with_multi_processing(self, docx_filename:str, start:int, end:int, **kwargs):
'''Parse and create pages based on page indexes with multi-processing.

Expand Down