From bc9559525fbe1957a9fb7bea407d7a135c25e2c1 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Sat, 14 Sep 2024 00:46:43 +0530 Subject: [PATCH 01/10] Python SDK Changes: - Feature added: now specific pages can be processed with the python sdk using "select_pages" param. Incorporates #23, #24 for python sdk - workflow for the above feature: create a new temperory pdf in the tempdir if select_pages is specified and follow the rest of the process as usual and finally map the page number in the formatted markdown to get the actual number instead of index. - raise warning when both select_pages and maintain used. - required adaptations and updates in messages, exceptions, types, processor, utils etc Fixes/improvements: - memory efficient pdf to image conversion, utilizing paths only option to directly get sorted image paths from pdf2image api Misc: - Bump the version tag - documentation updated --- README.md | 19 ++++--- poetry.lock | 20 ++++++- py_zerox/pyzerox/constants/messages.py | 8 +++ py_zerox/pyzerox/core/types.py | 3 +- py_zerox/pyzerox/core/zerox.py | 75 +++++++++++++++++--------- py_zerox/pyzerox/errors/__init__.py | 2 + py_zerox/pyzerox/errors/exceptions.py | 10 ++++ py_zerox/pyzerox/processor/__init__.py | 4 +- py_zerox/pyzerox/processor/pdf.py | 19 +++---- py_zerox/pyzerox/processor/utils.py | 71 +++++++++++++++++++++--- pyproject.toml | 3 +- setup.cfg | 3 +- setup.py | 2 +- 13 files changed, 184 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 450f0912..98230db2 100644 --- a/README.md +++ b/README.md @@ -215,8 +215,13 @@ kwargs = {"vertex_credentials": vertex_credentials} # Define main async entrypoint async def main(): file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported - output_dir = "./output_test" - result = await zerox(file_path=file_path, model=model, output_dir=output_dir,custom_system_prompt=custom_system_prompt, **kwargs) + + ## process only some pages or 
all + select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed) + + output_dir = "./output_test" ## directory to save the consolidated markdown file + result = await zerox(file_path=file_path, model=model, output_dir=output_dir, + custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs) return result @@ -239,6 +244,7 @@ async def zerox( output_dir: Optional[str] = None, temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, + select_pages: Optional[Union[int, Iterable[int]]] = None, **kwargs ) -> ZeroxOutput: ... @@ -260,10 +266,11 @@ Parameters - **output_dir** (Optional[str], optional): The directory to save the markdown output. Defaults to None. - **temp_dir** (str, optional): - The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it. + The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it. - **custom_system_prompt** (str, optional): - The system prompt to use for the model, this overrides the default system prompt of zerox. Defaults to None. - Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. + The system prompt to use for the model, this overrides the default system prompt of zerox.Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None. +- **select_pages** (Optional[Union[int, Iterable[int]]], optional): + Pages to process, can be a single page number or an iterable of page numbers, Defaults to None - **kwargs** (dict, optional): Additional keyword arguments to pass to the litellm.completion method. Refer to the LiteLLM Documentation and Completion Input for details. 
@@ -271,7 +278,7 @@ Parameters Returns - ZeroxOutput: - The markdown content generated by the model. + Contains the markdown content generated by the model and also some metadata (refer below). ### Example Output (Output from "azure/gpt-4o-mini"): diff --git a/poetry.lock b/poetry.lock index 49346fe8..f71d69d7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2277,6 +2277,24 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pypdf2" +version = "3.0.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, + {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, +] + +[package.extras] +crypto = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow", "PyCryptodome"] +image = ["Pillow"] + [[package]] name = "pytest" version = "8.3.2" @@ -3400,4 +3418,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "0443709553ceafbe487d4841543bba66800564fc00e46ee868301c8cdcc063ce" +content-hash = "5267971925918bf72ba6e779347ac85deabf1cd5b20a431f9a52aff4f8bf6b64" diff --git a/py_zerox/pyzerox/constants/messages.py b/py_zerox/pyzerox/constants/messages.py index 4dd0b053..e3ca8262 100644 --- a/py_zerox/pyzerox/constants/messages.py +++ b/py_zerox/pyzerox/constants/messages.py @@ -19,6 +19,14 @@ class Messages: Custom system prompt was provided which overrides the default system prompt. We assume that you know what you are doing. """ + MAINTAIN_FORMAT_SELECTED_PAGES_WARNING = """ + The maintain_format flag is set to True in conjunction with select_pages input given. 
This may result in unexpected behavior. + """ + + PAGE_NUMBER_OUT_OF_BOUND_ERROR = """ + The page number(s) provided is out of bound. Please provide a valid page number(s). + """ + NON_200_RESPONSE = """ Model API returned status code {status_code}: {data} diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py index dedd7f21..ffe251d3 100644 --- a/py_zerox/pyzerox/core/types.py +++ b/py_zerox/pyzerox/core/types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict, Any +from typing import List, Optional, Dict, Any, Union, Iterable from dataclasses import dataclass, field @@ -16,6 +16,7 @@ class ZeroxArgs: output_dir: Optional[str] = None temp_dir: Optional[str] = None custom_system_prompt: Optional[str] = None + select_pages: Optional[Union[int, Iterable[int]]] = None kwargs: Dict[str, Any] = field(default_factory=dict) @dataclass diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 81b76a26..e7505bce 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -1,10 +1,12 @@ import os import aioshutil as async_shutil import tempfile -from typing import List, Optional +import warnings +from typing import List, Optional, Union, Iterable from datetime import datetime import aiofiles import aiofiles.os as async_os +import asyncio # Package Imports from ..processor import ( @@ -12,9 +14,10 @@ download_file, process_page, process_pages_in_batches, - sorted_nicely, + create_selected_pages_pdf, ) from ..errors import FileUnavailable +from ..constants.messages import Messages from ..models import litellmmodel from .types import Page, ZeroxOutput @@ -28,6 +31,7 @@ async def zerox( output_dir: Optional[str] = None, temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, + select_pages: Optional[Union[int, Iterable[int]]] = None, **kwargs ) -> ZeroxOutput: """ @@ -38,7 +42,7 @@ async def zerox( :type cleanup: bool, optional :param concurrency: The number of concurrent 
processes to run, defaults to 10 :type concurrency: int, optional - :param file_path: The path to the PDF file to process + :param file_path: The path or URL to the PDF file to process. :type file_path: str, optional :param maintain_format: Whether to maintain the format from the previous page, defaults to False :type maintain_format: bool, optional @@ -50,6 +54,8 @@ async def zerox( :type temp_dir: str, optional :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None :type custom_system_prompt: str, optional + :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None + :type select_pages: int or Iterable[int], optional :param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input :return: The markdown content generated by the model. 
@@ -65,6 +71,24 @@ async def zerox( # File Path Validators if not file_path: raise FileUnavailable() + + # Create an instance of the litellm model interface + vision_model = litellmmodel(model=model,**kwargs) + + # override the system prompt if a custom prompt is provided + if custom_system_prompt: + vision_model.system_prompt = custom_system_prompt + + # Check if both maintain_format and select_pages are provided + if maintain_format and select_pages is not None: + warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING) + + # If select_pages is a single integer, convert it to a list for consistency + if isinstance(select_pages, int): + select_pages = [select_pages] + + # Sort the pages to maintain consistency + select_pages = sorted(select_pages) # Ensure the output directory exists if output_dir: @@ -91,26 +115,19 @@ async def zerox( local_path = await download_file(file_path=file_path, temp_dir=temp_directory) if not local_path: raise FileUnavailable() - + raw_file_name = os.path.splitext(os.path.basename(local_path))[0] file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name) + + # create a subset pdf in temp dir with only the requested pages if select_pages is provided + if select_pages is not None: + subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages, + "save_directory":temp_directory, "suffix":"_selected_pages"} + local_path = await asyncio.to_thread(create_selected_pages_pdf, + **subset_pdf_create_kwargs) - # Convert the file to a series of images - await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory) - - # Get a list of sorted converted images (alphanumeric human sorting) - images = list(sorted_nicely([ - f"{temp_directory}/{f}" - for f in await async_os.listdir(temp_directory) - if f.endswith(".png") - ])) - - # Create an instance of the litellm model interface - vision_model = litellmmodel(model=model,**kwargs) - - # override the system prompt if a custom prompt is 
provided - if custom_system_prompt: - vision_model.system_prompt = custom_system_prompt + # Convert the file to a series of images, below function returns a list of image paths in page order + images = await convert_pdf_to_images(local_path=local_path, temp_dir=temp_directory) if maintain_format: for image in images: @@ -155,10 +172,20 @@ async def zerox( # Format JSON response end_time = datetime.now() completion_time = (end_time - start_time).total_seconds() * 1000 - formatted_pages = [ - Page(content=content, page=i + 1, content_length=len(content)) - for i, content in enumerate(aggregated_markdown) - ] + + # Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers + if select_pages is not None: + # Map aggregated markdown to the selected pages + formatted_pages = [ + Page(content=content, page=select_pages[i], content_length=len(content)) + for i, content in enumerate(aggregated_markdown) + ] + else: + # Default behavior when no select_pages is provided + formatted_pages = [ + Page(content=content, page=i + 1, content_length=len(content)) + for i, content in enumerate(aggregated_markdown) + ] return ZeroxOutput( completion_time=completion_time, diff --git a/py_zerox/pyzerox/errors/__init__.py b/py_zerox/pyzerox/errors/__init__.py index 4e50fb06..ad3d25bc 100644 --- a/py_zerox/pyzerox/errors/__init__.py +++ b/py_zerox/pyzerox/errors/__init__.py @@ -1,6 +1,7 @@ from .exceptions import ( NotAVisionModel, ModelAccessError, + PageNumberOutOfBoundError, MissingEnvironmentVariables, ResourceUnreachableException, FileUnavailable, @@ -11,6 +12,7 @@ __all__ = [ "NotAVisionModel", "ModelAccessError", + "PageNumberOutOfBoundError", "MissingEnvironmentVariables", "ResourceUnreachableException", "FileUnavailable", diff --git a/py_zerox/pyzerox/errors/exceptions.py b/py_zerox/pyzerox/errors/exceptions.py index 905c19db..f2a1856c 100644 --- a/py_zerox/pyzerox/errors/exceptions.py +++ b/py_zerox/pyzerox/errors/exceptions.py @@ -35,6 
+35,16 @@ def __init__( ): super().__init__(message, extra_info) +class PageNumberOutOfBoundError(CustomException): + """Exception raised when invalid page number(s) are provided.""" + + def __init__( + self, + message: str = Messages.PAGE_NUMBER_OUT_OF_BOUND_ERROR, + extra_info: Optional[Dict] = None, + ): + super().__init__(message, extra_info) + class ResourceUnreachableException(CustomException): """Exception raised when a resource is unreachable.""" diff --git a/py_zerox/pyzerox/processor/__init__.py b/py_zerox/pyzerox/processor/__init__.py index 73991350..fe7be127 100644 --- a/py_zerox/pyzerox/processor/__init__.py +++ b/py_zerox/pyzerox/processor/__init__.py @@ -5,7 +5,7 @@ process_pages_in_batches, ) from .text import format_markdown -from .utils import download_file, sorted_nicely +from .utils import download_file, create_selected_pages_pdf __all__ = [ "save_image", @@ -15,5 +15,5 @@ "download_file", "process_page", "process_pages_in_batches", - "sorted_nicely", + "create_selected_pages_pdf", ] diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index 68808eed..c3b3fa63 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -11,27 +11,24 @@ from ..models import litellmmodel -async def convert_pdf_to_images(local_path: str, temp_dir: str): - """Converts a PDF file to a series of images.""" +async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]: + """Converts a PDF file to a series of images in the temp_dir. 
Returns a list of image paths in page order.""" options = { + "pdf_path": local_path, + "output_folder": temp_dir, "dpi": PDFConversionDefaultOptions.DPI, "fmt": PDFConversionDefaultOptions.FORMAT, "size": PDFConversionDefaultOptions.SIZE, "thread_count": PDFConversionDefaultOptions.THREAD_COUNT, "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO, + "paths_only": True, } - file_name = os.path.splitext(os.path.basename(local_path))[0] try: - images = await asyncio.to_thread( - convert_from_path, local_path, **options + image_paths = await asyncio.to_thread( + convert_from_path, **options ) - tasks = [] - for i, image in enumerate(images, start=1): - image_path = os.path.join(temp_dir, f"{file_name}_page_{i}.png") - tasks.append(save_image(image, image_path)) - await asyncio.gather(*tasks) - return images + return image_paths except Exception as err: logging.error(f"Error converting PDF to images: {err}") diff --git a/py_zerox/pyzerox/processor/utils.py b/py_zerox/pyzerox/processor/utils.py index 09c11d56..ae00fa1f 100644 --- a/py_zerox/pyzerox/processor/utils.py +++ b/py_zerox/pyzerox/processor/utils.py @@ -1,12 +1,14 @@ import os import re -from typing import Optional +from typing import Optional, Union, Iterable from urllib.parse import urlparse import aiofiles import aiohttp +from PyPDF2 import PdfReader, PdfWriter +from ..constants.messages import Messages # Package Imports -from ..errors.exceptions import ResourceUnreachableException +from ..errors.exceptions import ResourceUnreachableException, PageNumberOutOfBoundError async def download_file( @@ -42,9 +44,64 @@ def is_valid_url(string: str) -> bool: ] except ValueError: return False + +def create_selected_pages_pdf(original_pdf_path: str, select_pages: Union[int, Iterable[int]], + save_directory: str, suffix: str = "_selected_pages", + sorted_pages: bool = True) -> str: + """ + Creates a new PDF with only the selected pages. + + :param original_pdf_path: Path to the original PDF file. 
+ :type original_pdf_path: str + :param select_pages: A single page number or an iterable of page numbers (1-indexed). + :type select_pages: int or Iterable[int] + :param save_directory: The directory to store the new PDF. + :type save_directory: str + :param suffix: The suffix to add to the new PDF file name, defaults to "_selected_pages". + :type suffix: str, optional + :param sorted_pages: Whether to sort the selected pages, defaults to True. + :type sorted_pages: bool, optional + :return: Path to the new PDF file + """ -def sorted_nicely( l ): - """ Sort the given iterable in the way that humans expect -> alphanumerically""" - convert = lambda text: int(text) if text.isdigit() else text - alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] - return sorted(l, key = alphanum_key) + file_name = os.path.splitext(os.path.basename(original_pdf_path))[0] + + # Write the new PDF to a temporary file + selected_pages_pdf_path = os.path.join(save_directory, f"{file_name}{suffix}.pdf") + + # Ensure select_pages is iterable, if not, convert to list + if isinstance(select_pages, int): + select_pages = [select_pages] + + if sorted_pages: + # Sort the pages for consistency + select_pages = sorted(list(select_pages)) + + with open(original_pdf_path, "rb") as orig_pdf, open(selected_pages_pdf_path, "wb") as new_pdf: + + # Read the original PDF + reader = PdfReader(stream=orig_pdf) + total_pages = len(reader.pages) + + # Validate page numbers + invalid_page_numbers = [] + for page in select_pages: + if page < 1 or page > total_pages: + invalid_page_numbers.append(page) + + ## raise error if invalid page numbers + if invalid_page_numbers: + raise PageNumberOutOfBoundError(extra_info={"input_pdf_num_pages":total_pages, + "select_pages": select_pages, + "invalid_page_numbers": invalid_page_numbers}) + + # Create a new PDF writer + writer = PdfWriter(fileobj=new_pdf) + + # Add only the selected pages + for page_number in select_pages: + 
writer.add_page(reader.pages[page_number - 1]) + + writer.write(stream=new_pdf) + + return selected_pages_pdf_path diff --git a/pyproject.toml b/pyproject.toml index 28334971..b34a194b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "py-zerox" -version = "0.0.4" +version = "0.0.5" description = "ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc" authors = ["wizenheimer","pradhyumna85"] license = "MIT" @@ -19,6 +19,7 @@ aiohttp = "^3.9.5" pdf2image = "^1.17.0" litellm = "^1.44.15" aioshutil = "^1.5" +pypdf2 = "^3.0.1" [tool.poetry.scripts] pre-install = "py_zerox.scripts.pre_install:check_and_install" diff --git a/setup.cfg b/setup.cfg index 4a133eee..f8bbc0f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = py-zerox -version = 0.0.4 +version = 0.0.5 description = ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc long_description = file: README.md long_description_content_type = text/markdown @@ -23,6 +23,7 @@ install_requires = pdf2image>=1.17.0 litellm>=1.44.15 aioshutil>=1.5 + PyPDF2>=3.0.1 [options.packages.find] where = py_zerox.pyzerox diff --git a/setup.py b/setup.py index 4841aea2..75b677bc 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ def run(self): cmdclass={ "install": InstallSystemDependencies, }, - version="0.0.4", + version="0.0.5", packages=find_packages(where="py_zerox"), # Specify the root folder package_dir={"": "py_zerox"}, # Map root directory include_package_data=True, From e969d7f85cdf68fb12a462d4bdf9403ca15e5fa2 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Mon, 16 Sep 2024 00:42:02 +0530 Subject: [PATCH 02/10] Features added on top of PR #39 to resolve #37 - added post_process_function param to override/skip Zerox's default format_markdown post processing on the model's text output. 
- removed output_dir param and added output_file_path which is more flexible for arbitrary file extensions - page_separator param added (used when writing the consolidated output to the output_file_path --- README.md | 39 ++++++++++++++++++--------- py_zerox/pyzerox/core/types.py | 9 ++++--- py_zerox/pyzerox/core/zerox.py | 45 ++++++++++++++++++------------- py_zerox/pyzerox/processor/pdf.py | 18 ++++++++++--- 4 files changed, 74 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 98230db2..9c0651ca 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ Refer to the [LiteLLM Documentation](https://docs.litellm.ai/docs/providers) for ```python from pyzerox import zerox + import os import json import asyncio @@ -200,7 +201,7 @@ file_path = 'path/to/vertex_ai_service_account.json' # Load the JSON file with open(file_path, 'r') as file: - vertex_credentials = json.load(file) + vertex_credentials = json.load(file) # Convert to JSON string vertex_credentials_json = json.dumps(vertex_credentials) @@ -214,15 +215,23 @@ kwargs = {"vertex_credentials": vertex_credentials} # Define main async entrypoint async def main(): - file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported + file_path = "https://omni-demo-data.s3.amazonaws.com/test/cs101.pdf" ## local filepath and file URL supported + + ## process only some pages or all + select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed) + + output_file_path = "output.md" ## filepath to save the consolidated output file (markdown by default). Pass None to skip saving any output file + page_separator = "\n\n" ## The separator to use between pages when writing the output to `output_file_path` - ## process only some pages or all - select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed) + ## function to apply on model's text output (on each page). 
Function should take input as string and return output also as string. + ## By default uses Zerox's format_markdown function to format text as markdown + # post_process_function = lambda x: x.strip() ## To skip any post processing pass None, which would just keep the raw text output from the model. - output_dir = "./output_test" ## directory to save the consolidated markdown file - result = await zerox(file_path=file_path, model=model, output_dir=output_dir, - custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs) - return result + result = await zerox(file_path = file_path, model = model, output_file_path = output_file_path, + custom_system_prompt = custom_system_prompt, select_pages = select_pages, + # post_process_function = post_process_function, + **kwargs) + return result # run the main function: @@ -241,10 +250,12 @@ async def zerox( file_path: Optional[str] = "", maintain_format: bool = False, model: str = "gpt-4o-mini", - output_dir: Optional[str] = None, + output_file_path: Optional[str] = None, + page_separator: str = "\n\n", temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, select_pages: Optional[Union[int, Iterable[int]]] = None, + post_process_function: Optional[Callable[[str], str]] = format_markdown, **kwargs ) -> ZeroxOutput: ... @@ -263,14 +274,18 @@ Parameters - **model** (str, optional): The model to use for generating completions. Defaults to "gpt-4o-mini". Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider. -- **output_dir** (Optional[str], optional): - The directory to save the markdown output. Defaults to None. +- **output_file_path** (Optional[str], optional): + The path to save the markdown output (e.g., "output.md"). Any required directories will be created. Defaults to None. +- **page_separator** (str, optional): + The separator to use between pages when writing the output to `output_file_path`. Defaults to "\n\n". 
- **temp_dir** (str, optional): The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted before zerox uses it. - **custom_system_prompt** (str, optional): The system prompt to use for the model, this overrides the default system prompt of zerox.Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning. Defaults to None. - **select_pages** (Optional[Union[int, Iterable[int]]], optional): Pages to process, can be a single page number or an iterable of page numbers, Defaults to None +- **post_process_function** (Optional[Callable[[str], str]], optional): + A function to post-process the text output from the model for each page. It should take a string as input and return a string as output. Defaults to Zerox's `format_markdown` function, which formats the output in markdown. Pass None to skip post-processing. - **kwargs** (dict, optional): Additional keyword arguments to pass to the litellm.completion method. Refer to the LiteLLM Documentation and Completion Input for details. @@ -278,7 +293,7 @@ Parameters Returns - ZeroxOutput: - Contains the markdown content generated by the model and also some metadata (refer below). + Contains the output content (markdown as default) generated by the model and also some metadata (refer below). 
### Example Output (Output from "azure/gpt-4o-mini"): diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py index ffe251d3..038736f0 100644 --- a/py_zerox/pyzerox/core/types.py +++ b/py_zerox/pyzerox/core/types.py @@ -1,4 +1,5 @@ -from typing import List, Optional, Dict, Any, Union, Iterable +from typing import List, Optional, Dict, Any, Union, Iterable, Callable +from ..processor import format_markdown from dataclasses import dataclass, field @@ -12,11 +13,13 @@ class ZeroxArgs: cleanup: bool = True concurrency: int = 10 maintain_format: bool = False - model: str = "gpt-4o-mini", - output_dir: Optional[str] = None + model: str = "gpt-4o-mini" + output_file_path: Optional[str] = None + page_separator: str = "\n\n" temp_dir: Optional[str] = None custom_system_prompt: Optional[str] = None select_pages: Optional[Union[int, Iterable[int]]] = None + post_process_function: Optional[Callable[[str], str]] = format_markdown kwargs: Dict[str, Any] = field(default_factory=dict) @dataclass diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index e7505bce..ee277d65 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -2,7 +2,7 @@ import aioshutil as async_shutil import tempfile import warnings -from typing import List, Optional, Union, Iterable +from typing import List, Optional, Union, Iterable, Callable from datetime import datetime import aiofiles import aiofiles.os as async_os @@ -15,6 +15,7 @@ process_page, process_pages_in_batches, create_selected_pages_pdf, + format_markdown, ) from ..errors import FileUnavailable from ..constants.messages import Messages @@ -28,14 +29,16 @@ async def zerox( file_path: Optional[str] = "", maintain_format: bool = False, model: str = "gpt-4o-mini", - output_dir: Optional[str] = None, + output_file_path: Optional[str] = None, + page_separator: str = "\n\n", temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, select_pages: 
Optional[Union[int, Iterable[int]]] = None, + post_process_function: Optional[Callable[[str], str]] = format_markdown, **kwargs ) -> ZeroxOutput: """ - API to perform OCR to markdown using Vision models. + API to perform OCR to markdown (default) using Vision models. Please setup the environment variables for the model and model provider before using this API. Refer: https://docs.litellm.ai/docs/providers :param cleanup: Whether to cleanup the temporary files after processing, defaults to True @@ -48,24 +51,28 @@ async def zerox( :type maintain_format: bool, optional :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name. :type model: str, optional - :param output_dir: The directory to save the markdown output, defaults to None - :type output_dir: str, optional + :param output_file_path: The path to save the output file (Example "output.md"). Any required directories will be created, defaults to None + :type output_file_path: str, optional :param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it. :type temp_dir: str, optional + :param page_separator: The separator to use between pages when writing the output to "output_file_path", defaults to "\n\n" + :type page_separator: str :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. 
When set, it will raise a friendly warning, defaults to None :type custom_system_prompt: str, optional :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None :type select_pages: int or Iterable[int], optional + :param post_process_function: A function to post-process the text output from the model for each page. It should take string as an input and return string as an output, defaults to "format_markdown" function (zerox's default for markdown formatting). Pass None to skip any post processing on the text output of the model. + :type post_process_function: Callable[[str], str], optional :param kwargs: Additional keyword arguments to pass to the model.completion -> litellm.completion method. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input - :return: The markdown content generated by the model. + :return: The content generated by the model after Zerox's postprocessing (if provided). 
""" input_token_count = 0 output_token_count = 0 prior_page = "" - aggregated_markdown: List[str] = [] + aggregated_output: List[str] = [] start_time = datetime.now() # File Path Validators @@ -90,7 +97,8 @@ async def zerox( # Sort the pages to maintain consistency select_pages = sorted(select_pages) - # Ensure the output directory exists + # Ensure the directory for output_file_path exists + output_dir = os.path.dirname(output_file_path) if output_file_path else None if output_dir: await async_os.makedirs(output_dir, exist_ok=True) @@ -138,10 +146,11 @@ async def zerox( input_token_count, output_token_count, prior_page, + post_process_function, ) if result: - aggregated_markdown.append(result) + aggregated_output.append(result) else: results = await process_pages_in_batches( images, @@ -151,19 +160,19 @@ async def zerox( input_token_count, output_token_count, prior_page, + post_process_function, ) - aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)] + aggregated_output = [result[0] for result in results if isinstance(result[0], str)] ## add token usage input_token_count += sum([result[1] for result in results]) output_token_count += sum([result[2] for result in results]) - # Write the aggregated markdown to a file - if output_dir: - result_file_path = os.path.join(output_dir, f"{file_name}.md") - async with aiofiles.open(result_file_path, "w") as f: - await f.write("\n\n".join(aggregated_markdown)) + # Write the aggregated output to a file + if output_file_path: + async with aiofiles.open(output_file_path, "w") as f: + await f.write(page_separator.join(aggregated_output)) # Cleanup the downloaded PDF file if cleanup and os.path.exists(temp_directory): @@ -175,16 +184,16 @@ async def zerox( # Adjusting the formatted_pages logic to account for select_pages to output the correct page numbers if select_pages is not None: - # Map aggregated markdown to the selected pages + # Map aggregated_output to the selected pages formatted_pages 
= [ Page(content=content, page=select_pages[i], content_length=len(content)) - for i, content in enumerate(aggregated_markdown) + for i, content in enumerate(aggregated_output) ] else: # Default behavior when no select_pages is provided formatted_pages = [ Page(content=content, page=i + 1, content_length=len(content)) - for i, content in enumerate(aggregated_markdown) + for i, content in enumerate(aggregated_output) ] return ZeroxOutput( diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa63..9f681aa4 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -1,7 +1,7 @@ import logging import os import asyncio -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Callable from pdf2image import convert_from_path # Package Imports @@ -40,6 +40,7 @@ async def process_page( input_token_count: int = 0, output_token_count: int = 0, prior_page: str = "", + post_process_function: Optional[Callable[[str], str]] = format_markdown, semaphore: Optional[asyncio.Semaphore] = None, ) -> Tuple[str, int, int, str]: """Process a single page of a PDF""" @@ -54,6 +55,7 @@ async def process_page( input_token_count, output_token_count, prior_page, + post_process_function, ) image_path = os.path.join(temp_directory, image) @@ -66,12 +68,18 @@ async def process_page( prior_page=prior_page, ) - formatted_markdown = format_markdown(completion.content) + ## post process the completion + if post_process_function: + output_text = post_process_function(completion.content) + else: + ## skip post processing + output_text = completion.content + input_token_count += completion.input_tokens output_token_count += completion.output_tokens - prior_page = formatted_markdown + prior_page = output_text - return formatted_markdown, input_token_count, output_token_count, prior_page + return output_text, input_token_count, output_token_count, prior_page except Exception as error: 
logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}") @@ -86,6 +94,7 @@ async def process_pages_in_batches( input_token_count: int = 0, output_token_count: int = 0, prior_page: str = "", + post_process_function: Optional[Callable[[str], str]] = format_markdown, ): # Create a semaphore to limit the number of concurrent tasks semaphore = asyncio.Semaphore(concurrency) @@ -99,6 +108,7 @@ async def process_pages_in_batches( input_token_count, output_token_count, prior_page, + post_process_function, semaphore, ) for image in images From c96b83be88a6f01a710f7c180d5c1e5011d8c564 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Wed, 18 Sep 2024 18:12:58 +0530 Subject: [PATCH 03/10] bump the python sdk version tag --- pyproject.toml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b34a194b..fa76241d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "py-zerox" -version = "0.0.5" +version = "0.0.6" description = "ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc" authors = ["wizenheimer","pradhyumna85"] license = "MIT" diff --git a/setup.cfg b/setup.cfg index f8bbc0f3..68ee0bc7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = py-zerox -version = 0.0.5 +version = 0.0.6 description = ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc long_description = file: README.md long_description_content_type = text/markdown From 316e8f626df6c50db4cd62956fcc9512477df1b6 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Thu, 19 Sep 2024 12:12:26 +0530 Subject: [PATCH 04/10] added default page number separator which will use the page numbers --- py_zerox/pyzerox/core/types.py | 2 +- py_zerox/pyzerox/core/zerox.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git 
a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py index 038736f0..62cc1a3b 100644 --- a/py_zerox/pyzerox/core/types.py +++ b/py_zerox/pyzerox/core/types.py @@ -15,7 +15,7 @@ class ZeroxArgs: maintain_format: bool = False model: str = "gpt-4o-mini" output_file_path: Optional[str] = None - page_separator: str = "\n\n" + page_separator: Optional[str] = None temp_dir: Optional[str] = None custom_system_prompt: Optional[str] = None select_pages: Optional[Union[int, Iterable[int]]] = None diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index ee277d65..4b630407 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -30,7 +30,7 @@ async def zerox( maintain_format: bool = False, model: str = "gpt-4o-mini", output_file_path: Optional[str] = None, - page_separator: str = "\n\n", + page_separator: Optional[str] = None, temp_dir: Optional[str] = None, custom_system_prompt: Optional[str] = None, select_pages: Optional[Union[int, Iterable[int]]] = None, @@ -55,8 +55,8 @@ async def zerox( :type output_file_path: str, optional :param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it. :type temp_dir: str, optional - :param page_separator: The separator to use between pages when writing the output to "output_file_path", defaults to "\n\n" - :type page_separator: str + :param page_separator: The separator to use between pages (at the end of each page) when writing the output to "output_file_path", can include a {page_no} placeholder to insert the page number. Uses "\\n<=== Page {page_no} ===>\\n" by default. defaults to None + :type page_separator: str, None :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. 
When set, it will raise a friendly warning, defaults to None :type custom_system_prompt: str, optional :param select_pages: Pages to process, can be a single page number or an iterable of page numbers, defaults to None @@ -171,8 +171,19 @@ async def zerox( # Write the aggregated output to a file if output_file_path: + if not page_separator: + page_separator = "\n<=== Page {page_no} ===>\n" + async with aiofiles.open(output_file_path, "w") as f: - await f.write(page_separator.join(aggregated_output)) + for i, page_content in enumerate(aggregated_output): + await f.write(page_content) + + # Replace {page_no} with the actual page number in page_separator + if "{page_no}" in page_separator: + page_no_text = page_separator.format(page_no=(select_pages[i] if select_pages else i + 1)) + await f.write(f"{page_no_text}") + else: + await f.write(page_separator) # Cleanup the downloaded PDF file if cleanup and os.path.exists(temp_directory): From d79740f64a6bd1f5c04da30e8bdae944ef799fa3 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Thu, 19 Sep 2024 12:33:57 +0530 Subject: [PATCH 05/10] fix: bump version tag in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 75b677bc..7627d5f1 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ def run(self): cmdclass={ "install": InstallSystemDependencies, }, - version="0.0.5", + version="0.0.6", packages=find_packages(where="py_zerox"), # Specify the root folder package_dir={"": "py_zerox"}, # Map root directory include_package_data=True, From 71d7d1b5616f22eea61ecf27c33f70ee8e2daceb Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Thu, 19 Sep 2024 13:01:36 +0530 Subject: [PATCH 06/10] fix issue after #39: select pages error when select pages is not passed --- py_zerox/pyzerox/core/zerox.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 
4b630407..475a0f5a 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -91,11 +91,12 @@ async def zerox( warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING) # If select_pages is a single integer, convert it to a list for consistency - if isinstance(select_pages, int): - select_pages = [select_pages] - - # Sort the pages to maintain consistency - select_pages = sorted(select_pages) + if select_pages: + if isinstance(select_pages, int): + select_pages = [select_pages] + else: + # Sort the pages to maintain consistency + select_pages = sorted(list(select_pages)) # Ensure the directory for output_file_path exists output_dir = os.path.dirname(output_file_path) if output_file_path else None From 5846b57f95b4a807f33e3db3b6c50d581d4febd1 Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Thu, 19 Sep 2024 13:07:38 +0530 Subject: [PATCH 07/10] better default to prevent mixup in markdown output --- py_zerox/pyzerox/core/zerox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 475a0f5a..13b89028 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -55,7 +55,7 @@ async def zerox( :type output_file_path: str, optional :param temp_dir: The directory to store temporary files, defaults to some named folder in system's temp directory. If already exists, the contents will be deleted for zerox uses it. :type temp_dir: str, optional - :param page_separator: The separator to use between pages (at the end of each page) when writing the output to "output_file_path", can include a {page_no} placeholder to insert the page number. Uses "\\n<=== Page {page_no} ===>\\n" by default. defaults to None + :param page_separator: The separator to use between pages (at the end of each page) when writing the output to "output_file_path", can include a {page_no} placeholder to insert the page number. 
Uses "\\n\\n<=== Page {page_no} ===>\\n\\n" by default. defaults to None :type page_separator: str, None :param custom_system_prompt: The system prompt to use for the model, this overrides the default system prompt of zerox. Generally it is not required unless you want some specific behaviour. When set, it will raise a friendly warning, defaults to None :type custom_system_prompt: str, optional @@ -173,7 +173,7 @@ async def zerox( # Write the aggregated output to a file if output_file_path: if not page_separator: - page_separator = "\n<=== Page {page_no} ===>\n" + page_separator = "\n\n<=== Page {page_no} ===>\n\n" async with aiofiles.open(output_file_path, "w") as f: for i, page_content in enumerate(aggregated_output): From eccc73bee4b6b33e0cf134261696b1cf81e2d09d Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Thu, 19 Sep 2024 23:11:20 +0530 Subject: [PATCH 08/10] minor update --- py_zerox/pyzerox/core/zerox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index 13b89028..0f86f3fd 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -172,7 +172,7 @@ async def zerox( # Write the aggregated output to a file if output_file_path: - if not page_separator: + if not page_separator and not isinstance(page_separator, str): page_separator = "\n\n<=== Page {page_no} ===>\n\n" async with aiofiles.open(output_file_path, "w") as f: From 061147866de20b6cb8b04a155c82493a8d20640a Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore Date: Mon, 21 Oct 2024 19:26:26 +0530 Subject: [PATCH 09/10] add a way to skip given model vision capability as litellm uses a static json with various models to validate vision capability instead of actual test: https://github.com/BerriAI/litellm/blob/fb523b79e9fdd7ce2d3a33f6c57a3679c7249e35/litellm/utils.py#L4974 --- py_zerox/pyzerox/models/modellitellm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff 
--git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py index bda48287..b58f5c9a 100644 --- a/py_zerox/pyzerox/models/modellitellm.py +++ b/py_zerox/pyzerox/models/modellitellm.py @@ -35,7 +35,11 @@ def __init__( ## calling custom methods to validate the environment and model self.validate_environment() - self.validate_model() + + ## way to override vision validation + if self.kwargs.get("validate_vision_capability", True): + self.validate_model() + self.validate_access() @property From 23a7654bce5727aa5926f9f42bba69b0aacc1c8f Mon Sep 17 00:00:00 2001 From: pradhyumna85 Date: Tue, 22 Oct 2024 17:53:43 +0530 Subject: [PATCH 10/10] added way to pass non litellm meta params --- py_zerox/pyzerox/models/modellitellm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py index b58f5c9a..b544bb7d 100644 --- a/py_zerox/pyzerox/models/modellitellm.py +++ b/py_zerox/pyzerox/models/modellitellm.py @@ -14,7 +14,6 @@ DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT - class litellmmodel(BaseModel): ## setting the default system prompt _system_prompt = DEFAULT_SYSTEM_PROMPT @@ -30,14 +29,22 @@ def __init__( :type model: str, optional :param kwargs: Additional keyword arguments to pass to self.completion -> litellm.completion. Refer: https://docs.litellm.ai/docs/providers and https://docs.litellm.ai/docs/completion/input + + Note: kwargs params starting with "__zxmetaconfig" are treated as meta config params and are not passed to litellm backend. 
""" super().__init__(model=model, **kwargs) + ## create another dict having the keys starting with "__zxmetaconfig" + self.meta_config = {k: v for k, v in self.kwargs.items() if k.startswith("__zxmetaconfig")} + + ## remove the meta config keys from kwargs + self.kwargs = {k: v for k, v in self.kwargs.items() if not k.startswith("__zxmetaconfig")} + ## calling custom methods to validate the environment and model self.validate_environment() ## way to override vision validation - if self.kwargs.get("validate_vision_capability", True): + if self.meta_config.get("__zxmetaconfig_validate_vision_capability", True): self.validate_model() self.validate_access()