10 changes: 5 additions & 5 deletions docs/extras/code_samples/default_v2.txt
@@ -1,4 +1,4 @@
from mindee import ClientV2, InferencePredictOptions
from mindee import ClientV2, InferenceParameters

input_path = "/path/to/the/file.ext"
api_key = "MY_API_KEY"
@@ -7,20 +7,20 @@ model_id = "MY_MODEL_ID"
# Init a new client
mindee_client = ClientV2(api_key)

# Set inference options
options = InferencePredictOptions(
# Set inference parameters
params = InferenceParameters(
# ID of the model, required.
model_id=model_id,
# If set to `True`, will enable Retrieval-Augmented Generation.
rag=False,
)

# Load a file from disk
input_doc = mindee_client.source_from_path(input_path)
input_source = mindee_client.source_from_path(input_path)

# Upload the file
response = mindee_client.enqueue_and_parse(
input_doc, options
input_source, params
)

# Print a brief summary of the parsed data
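For reference, a hedged sketch of driving the queue by hand with the renamed class, using only `ClientV2.enqueue` and `ClientV2.parse_queued` as they appear in the `client_v2.py` changes below; the API key, path, model ID, and 2-second delay are placeholders.

```python
from time import sleep

from mindee import ClientV2, InferenceParameters, JobResponse

mindee_client = ClientV2("MY_API_KEY")
input_source = mindee_client.source_from_path("/path/to/the/file.ext")
params = InferenceParameters(model_id="MY_MODEL_ID")

# Enqueue only, keeping the job ID so polling can happen on our own schedule.
job_response = mindee_client.enqueue(input_source, params)

# parse_queued returns a JobResponse while the job is still processing and an
# InferenceResponse once it is done (mirroring enqueue_and_parse below).
result = mindee_client.parse_queued(job_response.job.id)
while isinstance(result, JobResponse):
    if result.job.status == "Failed":
        raise RuntimeError("Inference job failed")
    sleep(2)  # placeholder delay between polls
    result = mindee_client.parse_queued(job_response.job.id)

print(result)
```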
22 changes: 21 additions & 1 deletion mindee/__init__.py
@@ -1,12 +1,32 @@
from mindee import product
from mindee.client import Client
from mindee.client_v2 import ClientV2
from mindee.input.inference_predict_options import InferencePredictOptions
from mindee.input.inference_parameters import InferenceParameters
from mindee.input.local_response import LocalResponse
from mindee.input.page_options import PageOptions
from mindee.input.polling_options import PollingOptions
from mindee.parsing.common.api_response import ApiResponse
from mindee.parsing.common.async_predict_response import AsyncPredictResponse
from mindee.parsing.common.feedback_response import FeedbackResponse
from mindee.parsing.common.job import Job
from mindee.parsing.common.predict_response import PredictResponse
from mindee.parsing.common.workflow_response import WorkflowResponse
from mindee.parsing.v2.inference_response import InferenceResponse
from mindee.parsing.v2.job_response import JobResponse

__all__ = [
"Client",
"ClientV2",
"InferenceParameters",
"LocalResponse",
"PageOptions",
"PollingOptions",
"ApiResponse",
"AsyncPredictResponse",
"FeedbackResponse",
"PredictResponse",
"WorkflowResponse",
"JobResponse",
"InferenceResponse",
"product",
]
39 changes: 16 additions & 23 deletions mindee/client_v2.py
@@ -4,7 +4,7 @@
from mindee.client_mixin import ClientMixin
from mindee.error.mindee_error import MindeeError
from mindee.error.mindee_http_error_v2 import handle_error_v2
from mindee.input.inference_predict_options import InferencePredictOptions
from mindee.input.inference_parameters import InferenceParameters
from mindee.input.local_response import LocalResponse
from mindee.input.polling_options import PollingOptions
from mindee.input.sources.local_input_source import LocalInputSource
@@ -38,28 +38,21 @@ def __init__(self, api_key: Optional[str] = None) -> None:
self.mindee_api = MindeeApiV2(api_key)

def enqueue(
self, input_source: LocalInputSource, options: InferencePredictOptions
self, input_source: LocalInputSource, params: InferenceParameters
) -> JobResponse:
"""
Enqueues a document to a given model.

:param input_source: The document/source file to use.
Has to be created beforehand.

:param options: Options for the prediction.
:param params: Parameters to set when sending a file.
:return: A valid inference response.
"""
logger.debug("Enqueuing document to '%s'", options.model_id)

if options.page_options and input_source.is_pdf():
input_source.process_pdf(
options.page_options.operation,
options.page_options.on_min_pages,
options.page_options.page_indexes,
)
logger.debug("Enqueuing document to '%s'", params.model_id)

response = self.mindee_api.predict_async_req_post(
input_source=input_source, options=options
input_source=input_source, options=params
)
dict_response = response.json()

@@ -89,35 +82,35 @@ def parse_queued(
return InferenceResponse(dict_response)

def enqueue_and_parse(
self, input_source: LocalInputSource, options: InferencePredictOptions
self, input_source: LocalInputSource, params: InferenceParameters
) -> InferenceResponse:
"""
Enqueues to an asynchronous endpoint and automatically polls for a response.

:param input_source: The document/source file to use.
Has to be created beforehand.

:param options: Options for the prediction.
:param params: Parameters to set when sending a file.

:return: A valid inference response.
"""
if not options.polling_options:
options.polling_options = PollingOptions()
if not params.polling_options:
params.polling_options = PollingOptions()
self._validate_async_params(
options.polling_options.initial_delay_sec,
options.polling_options.delay_sec,
options.polling_options.max_retries,
params.polling_options.initial_delay_sec,
params.polling_options.delay_sec,
params.polling_options.max_retries,
)
queue_result = self.enqueue(input_source, options)
queue_result = self.enqueue(input_source, params)
logger.debug(
"Successfully enqueued document with job id: %s", queue_result.job.id
)
sleep(options.polling_options.initial_delay_sec)
sleep(params.polling_options.initial_delay_sec)
retry_counter = 1
poll_results = self.parse_queued(
queue_result.job.id,
)
while retry_counter < options.polling_options.max_retries:
while retry_counter < params.polling_options.max_retries:
if not isinstance(poll_results, JobResponse):
break
if poll_results.job.status == "Failed":
@@ -133,7 +126,7 @@ def enqueue_and_parse(
queue_result.job.id,
)
retry_counter += 1
sleep(options.polling_options.delay_sec)
sleep(params.polling_options.delay_sec)
poll_results = self.parse_queued(queue_result.job.id)

if not isinstance(poll_results, InferenceResponse):
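Since `enqueue` no longer trims PDFs itself, callers that relied on `page_options` now apply the cut on the input source before enqueuing. A minimal sketch of the new flow, assuming placeholder paths and model ID, and assuming `PageOptions` defaults to the keep-only operation as the test changes below suggest (the `apply_page_options` helper is added in `local_input_source.py` further down):

```python
from mindee import ClientV2, InferenceParameters, PageOptions

mindee_client = ClientV2("MY_API_KEY")
input_source = mindee_client.source_from_path("/path/to/multipage.pdf")

# Page manipulation now lives on the input source, not on the parameters.
if input_source.is_pdf():
    # Keep only the first page when the document has at least 2 pages.
    input_source.apply_page_options(PageOptions(on_min_pages=2, page_indexes=[0]))

params = InferenceParameters(model_id="MY_MODEL_ID")
response = mindee_client.enqueue_and_parse(input_source, params)
```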
1 change: 0 additions & 1 deletion mindee/input/__init__.py
@@ -1,4 +1,3 @@
from mindee.input.inference_predict_options import InferencePredictOptions
from mindee.input.local_response import LocalResponse
from mindee.input.page_options import PageOptions
from mindee.input.polling_options import PollingOptions
mindee/input/inference_predict_options.py → mindee/input/inference_parameters.py
@@ -1,13 +1,12 @@
from dataclasses import dataclass
from typing import List, Optional

from mindee.input.page_options import PageOptions
from mindee.input.polling_options import PollingOptions


@dataclass
class InferencePredictOptions:
"""Inference prediction options."""
class InferenceParameters:
"""Inference parameters to set when sending a file."""

model_id: str
"""ID of the model, required."""
@@ -17,9 +16,7 @@ class InferencePredictOptions:
"""Optional alias for the file."""
webhook_ids: Optional[List[str]] = None
"""IDs of webhooks to propagate the API response to."""
page_options: Optional[PageOptions] = None
"""Options for page-level inference."""
polling_options: Optional[PollingOptions] = None
"""Options for polling."""
"""Options for polling. Set only if having timeout issues."""
close_file: bool = True
"""Whether to close the file after parsing."""
4 changes: 2 additions & 2 deletions mindee/input/polling_options.py
@@ -4,9 +4,9 @@ class PollingOptions:
initial_delay_sec: float
"""Initial delay before the first polling attempt."""
delay_sec: float
"""Delay between each polling attempts."""
"""Delay between each polling attempt."""
max_retries: int
"""Total amount of polling attempts."""
"""Total number of polling attempts."""

def __init__(
self,
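Polling tweaks plug into `InferenceParameters.polling_options`, and `enqueue_and_parse` fills in defaults when the field is left unset. A minimal sketch, assuming the constructor keywords mirror the attributes documented above (the `__init__` body is collapsed in this diff) and using placeholder timings:

```python
from mindee import InferenceParameters, PollingOptions

params = InferenceParameters(
    model_id="MY_MODEL_ID",
    # Only set this if the defaults cause timeout issues.
    polling_options=PollingOptions(
        initial_delay_sec=4.0,  # wait before the first status check
        delay_sec=2.0,          # wait between subsequent checks
        max_retries=60,         # total number of polling attempts
    ),
)
```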
12 changes: 11 additions & 1 deletion mindee/input/sources/local_input_source.py
@@ -8,7 +8,7 @@
from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.image_operations.image_compressor import compress_image
from mindee.input.page_options import KEEP_ONLY, REMOVE
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
from mindee.input.sources.input_type import InputType
from mindee.logger import logger
from mindee.pdf.pdf_compressor import compress_pdf
@@ -112,6 +112,16 @@ def count_doc_pages(self) -> int:
return len(pdf)
return 1

def apply_page_options(self, page_options: PageOptions) -> None:
"""Apply cut and merge options on multipage documents."""
if not self.is_pdf():
raise MindeeSourceError(f"File is not a PDF: {self.filename}")
self.process_pdf(
page_options.operation,
page_options.on_min_pages,
page_options.page_indexes,
)

def process_pdf(
self,
behavior: str,
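Unlike the old in-client handling, which only ran for PDFs, `apply_page_options` raises when the source is not a PDF. A quick sketch of the guard, with a placeholder JPEG path:

```python
from mindee.error.mindee_error import MindeeSourceError
from mindee.input import PathInput
from mindee.input.page_options import PageOptions

jpeg_source = PathInput("/path/to/receipt.jpg")
try:
    jpeg_source.apply_page_options(PageOptions(on_min_pages=2, page_indexes=[0]))
except MindeeSourceError as err:
    print(err)  # e.g. "File is not a PDF: receipt.jpg"
```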
4 changes: 2 additions & 2 deletions mindee/mindee_http/mindee_api_v2.py
@@ -5,7 +5,7 @@

from mindee.error.mindee_error import MindeeApiV2Error
from mindee.input import LocalInputSource
from mindee.input.inference_predict_options import InferencePredictOptions
from mindee.input.inference_parameters import InferenceParameters
from mindee.logger import logger
from mindee.mindee_http.base_settings import USER_AGENT
from mindee.mindee_http.settings_mixin import SettingsMixin
@@ -68,7 +68,7 @@ def set_from_env(self) -> None:
logger.debug("Value was set from env: %s", name)

def predict_async_req_post(
self, input_source: LocalInputSource, options: InferencePredictOptions
self, input_source: LocalInputSource, options: InferenceParameters
) -> requests.Response:
"""
Make an asynchronous request to POST a document for prediction on the V2 API.
10 changes: 4 additions & 6 deletions tests/test_client_v2.py
@@ -2,7 +2,7 @@

import pytest

from mindee import ClientV2, InferencePredictOptions, LocalResponse
from mindee import ClientV2, InferenceParameters, LocalResponse
from mindee.error.mindee_error import MindeeApiV2Error
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
from mindee.input import LocalInputSource, PathInput
@@ -96,9 +96,7 @@ def test_enqueue_path_with_env_token(custom_base_url_client):
f"{FILE_TYPES_DIR}/receipt.jpg"
)
with pytest.raises(MindeeHTTPErrorV2):
custom_base_url_client.enqueue(
input_doc, InferencePredictOptions("dummy-model")
)
custom_base_url_client.enqueue(input_doc, InferenceParameters("dummy-model"))


@pytest.mark.v2
@@ -108,7 +106,7 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client):
)
with pytest.raises(MindeeHTTPErrorV2):
custom_base_url_client.enqueue_and_parse(
input_doc, InferencePredictOptions("dummy-model")
input_doc, InferenceParameters("dummy-model")
)


@@ -128,7 +126,7 @@ def test_error_handling(custom_base_url_client):
PathInput(
V2_DATA_DIR / "products" / "financial_document" / "default_sample.jpg"
),
InferencePredictOptions("dummy-model"),
InferenceParameters("dummy-model"),
)
assert e.status_code == -1
assert e.detail == "forced failure from test"
8 changes: 4 additions & 4 deletions tests/test_client_v2_integration.py
@@ -5,7 +5,7 @@

import pytest

from mindee import ClientV2, InferencePredictOptions
from mindee import ClientV2, InferenceParameters
from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2
from mindee.parsing.v2.inference_response import InferenceResponse
from tests.test_inputs import FILE_TYPES_DIR, PRODUCT_DATA_DIR
@@ -40,7 +40,7 @@ def test_parse_file_empty_multiple_pages_must_succeed(
assert input_path.exists(), f"sample file missing: {input_path}"

input_doc = v2_client.source_from_path(input_path)
options = InferencePredictOptions(findoc_model_id)
options = InferenceParameters(findoc_model_id)

response: InferenceResponse = v2_client.enqueue_and_parse(input_doc, options)

@@ -66,7 +66,7 @@ def test_parse_file_filled_single_page_must_succeed(
assert input_path.exists(), f"sample file missing: {input_path}"

input_doc = v2_client.source_from_path(input_path)
options = InferencePredictOptions(findoc_model_id)
options = InferenceParameters(findoc_model_id)

response: InferenceResponse = v2_client.enqueue_and_parse(input_doc, options)

@@ -95,7 +95,7 @@ def test_invalid_uuid_must_throw_error_422(v2_client: ClientV2) -> None:
assert input_path.exists()

input_doc = v2_client.source_from_path(input_path)
options = InferencePredictOptions("INVALID MODEL ID")
options = InferenceParameters("INVALID MODEL ID")

with pytest.raises(MindeeHTTPErrorV2) as exc_info:
v2_client.enqueue(input_doc, options)
31 changes: 22 additions & 9 deletions tests/test_inputs.py
@@ -6,7 +6,7 @@

from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.input.page_options import KEEP_ONLY, REMOVE
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
from mindee.input.sources.base_64_input import Base64Input
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.file_input import FileInput
@@ -45,15 +45,8 @@ def test_pdf_reconstruct_no_cut():
assert isinstance(input_file.file_object, io.BufferedReader)


@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_pdf_cut_n_pages(numb_pages: int):
input_obj = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
def _assert_pdf_options(input_obj, numb_pages):
assert input_obj.is_pdf() is True
input_obj.process_pdf(
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages]
)
assert input_obj.count_doc_pages() == numb_pages

# Currently the least verbose way of comparing pages with pypdfium2
# I.e. each page is read & rendered as a rasterized image. These images are then compared as raw byte sequences.
cut_pdf = pdfium.PdfDocument(input_obj.file_object)
@@ -69,6 +62,26 @@ def test_pdf_cut_n_pages(numb_pages: int):
pdf.close()


@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_process_pdf_cut_n_pages(numb_pages: int):
input_obj = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
input_obj.process_pdf(
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages]
)
assert input_obj.count_doc_pages() == numb_pages
_assert_pdf_options(input_obj, numb_pages)


@pytest.mark.parametrize("numb_pages", [1, 2, 3])
def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
input_obj = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
input_obj.apply_page_options(
PageOptions(on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages])
)
assert input_obj.count_doc_pages() == numb_pages
_assert_pdf_options(input_obj, numb_pages)


def test_pdf_keep_5_first_pages():
input_obj = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
assert input_obj.is_pdf() is True