From e3637b1ca6ff33493754821177b2be53332a6948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ianar=C3=A9=20s=C3=A9vi?= Date: Mon, 24 Nov 2025 18:08:39 +0100 Subject: [PATCH] :sparkles: add data schema override to v2 --- mindee/input/inference_parameters.py | 44 +++++- mindee/mindee_http/mindee_api_v2.py | 4 +- mindee/parsing/v2/inference_active_options.py | 16 +++ tests/data | 2 +- tests/v2/input/test_local_response.py | 2 +- tests/v2/parsing/test_inference_response.py | 134 ++++++++---------- tests/v2/test_client.py | 7 +- tests/v2/test_client_integration.py | 90 ++++++++---- 8 files changed, 190 insertions(+), 109 deletions(-) diff --git a/mindee/input/inference_parameters.py b/mindee/input/inference_parameters.py index 0df0495a..dd57c38a 100644 --- a/mindee/input/inference_parameters.py +++ b/mindee/input/inference_parameters.py @@ -1,9 +1,41 @@ +import json from dataclasses import dataclass -from typing import List, Optional +from typing import List, Optional, Union from mindee.input.polling_options import PollingOptions +class DataSchema: + """Modify the Data Schema.""" + + _replace: Optional[dict] = None + + def __init__(self, replace: Optional[dict] = None): + self._replace = replace + + @property + def replace(self): + """If set, completely replaces the data schema of the model.""" + return self._replace + + @replace.setter + def replace(self, value: Optional[Union[dict, str]]) -> None: + if value is None: + _replace = None + elif isinstance(value, str): + _replace = json.loads(value) + elif isinstance(value, dict): + _replace = value + else: + raise TypeError("Invalid type for data schema") + if _replace is not None and _replace == {}: + raise ValueError("Empty override provided") + self._replace = _replace + + def __str__(self) -> str: + return json.dumps({"replace": self.replace}) + + @dataclass class InferenceParameters: """Inference parameters to set when sending a file.""" @@ -30,4 +62,12 @@ class InferenceParameters: close_file: bool = True """Whether to 
close the file after parsing.""" text_context: Optional[str] = None - """Additional text context used by the model during inference. Not recommended, for specific use only.""" + """ + Additional text context used by the model during inference. + Not recommended, for specific use only. + """ + data_schema: Optional[DataSchema] = None + """ + Dynamic changes to the data schema of the model for this inference. + Not recommended, for specific use only. + """ diff --git a/mindee/mindee_http/mindee_api_v2.py b/mindee/mindee_http/mindee_api_v2.py index a8e380c8..9990330c 100644 --- a/mindee/mindee_http/mindee_api_v2.py +++ b/mindee/mindee_http/mindee_api_v2.py @@ -98,8 +98,10 @@ def req_post_inference_enqueue( data["webhook_ids"] = params.webhook_ids if params.alias and len(params.alias): data["alias"] = params.alias - if params.text_context and (params.text_context): + if params.text_context and len(params.text_context): data["text_context"] = params.text_context + if params.data_schema is not None: + data["data_schema"] = str(params.data_schema) if isinstance(input_source, LocalInputSource): files = {"file": input_source.read_contents(params.close_file)} diff --git a/mindee/parsing/v2/inference_active_options.py b/mindee/parsing/v2/inference_active_options.py index 2891173c..d94749a2 100644 --- a/mindee/parsing/v2/inference_active_options.py +++ b/mindee/parsing/v2/inference_active_options.py @@ -1,6 +1,18 @@ from mindee.parsing.common.string_dict import StringDict +class DataSchemaActiveOptions: + """Data schema options activated during the inference.""" + + replace: bool + + def __init__(self, raw_response: StringDict): + self.replace = raw_response["replace"] + + def __str__(self) -> str: + return f"Data Schema\n-----------\n:Replace: {self.replace}" + + class InferenceActiveOptions: """Active options for the inference.""" @@ -29,6 +41,8 @@ class InferenceActiveOptions: Whether the text context feature was activated. 
When this feature is activated, the provided context is used to improve the accuracy of the inference. """ + data_schema: DataSchemaActiveOptions + """Data schema options provided for the inference.""" def __init__(self, raw_response: StringDict): self.raw_text = raw_response["raw_text"] @@ -36,6 +50,7 @@ def __init__(self, raw_response: StringDict): self.confidence = raw_response["confidence"] self.rag = raw_response["rag"] self.text_context = raw_response["text_context"] + self.data_schema = DataSchemaActiveOptions(raw_response["data_schema"]) def __str__(self) -> str: return ( @@ -44,4 +59,5 @@ def __str__(self) -> str: f"\n:Polygon: {self.polygon}" f"\n:Confidence: {self.confidence}" f"\n:RAG: {self.rag}" + f"\n:Text Context: {self.text_context}" ) diff --git a/tests/data b/tests/data index f86f3eaf..7560dd55 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit f86f3eaf540f0babeb3d4f1a458d764856a2170b +Subproject commit 7560dd5532c10b4d3fb85991f386e9809dd2750a diff --git a/tests/v2/input/test_local_response.py b/tests/v2/input/test_local_response.py index e86d7877..5ce07fe1 100644 --- a/tests/v2/input/test_local_response.py +++ b/tests/v2/input/test_local_response.py @@ -14,7 +14,7 @@ def file_path() -> Path: def _assert_local_response(local_response): fake_hmac_signing = "ogNjY44MhvKPGTtVsI8zG82JqWQa68woYQH" - signature = "b82a515c832fd2c4f4ce3a7e6f53c12e8d10e19223f6cf0e3a9809a7a3da26be" + signature = "1df388c992d87897fe61dfc56c444c58fc3c7369c31e2b5fd20d867695e93e85" assert local_response._file is not None assert not local_response.is_valid_hmac_signature( diff --git a/tests/v2/parsing/test_inference_response.py b/tests/v2/parsing/test_inference_response.py index f494bb48..188ede49 100644 --- a/tests/v2/parsing/test_inference_response.py +++ b/tests/v2/parsing/test_inference_response.py @@ -41,54 +41,46 @@ def _get_product_samples(product, name: str) -> Tuple[dict, str]: @pytest.mark.v2 def test_deep_nested_fields(): json_sample, rst_sample 
= _get_inference_samples("deep_nested_fields") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) - assert isinstance( - inference_result.inference.result.fields["field_simple"], SimpleField - ) - assert isinstance( - inference_result.inference.result.fields["field_object"], ObjectField - ) + response = InferenceResponse(json_sample) + assert isinstance(response.inference, Inference) + assert isinstance(response.inference.result.fields["field_simple"], SimpleField) + assert isinstance(response.inference.result.fields["field_object"], ObjectField) assert isinstance( - inference_result.inference.result.fields["field_object"].fields[ - "sub_object_list" - ], + response.inference.result.fields["field_object"].fields["sub_object_list"], ListField, ) assert isinstance( - inference_result.inference.result.fields["field_object"].fields[ - "sub_object_object" - ], + response.inference.result.fields["field_object"].fields["sub_object_object"], ObjectField, ) assert isinstance( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] .fields["sub_object_object"] .fields, dict, ) assert isinstance( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] .fields["sub_object_object"] .fields["sub_object_object_sub_object_list"], ListField, ) assert isinstance( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] .fields["sub_object_object"] .fields["sub_object_object_sub_object_list"] .items, list, ) assert isinstance( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] .fields["sub_object_object"] .fields["sub_object_object_sub_object_list"] .items[0], ObjectField, ) assert isinstance( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] 
.fields["sub_object_object"] .fields["sub_object_object_sub_object_list"] .items[0] @@ -96,7 +88,7 @@ def test_deep_nested_fields(): SimpleField, ) assert ( - inference_result.inference.result.fields["field_object"] + response.inference.result.fields["field_object"] .fields["sub_object_object"] .fields["sub_object_object_sub_object_list"] .items[0] @@ -109,43 +101,41 @@ def test_deep_nested_fields(): @pytest.mark.v2 def test_standard_field_types(): json_sample, rst_sample = _get_inference_samples("standard_field_types") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) - field_simple_string = inference_result.inference.result.fields[ - "field_simple_string" - ] + response = InferenceResponse(json_sample) + assert isinstance(response.inference, Inference) + field_simple_string = response.inference.result.fields["field_simple_string"] assert isinstance(field_simple_string, SimpleField) assert field_simple_string.value == "field_simple_string-value" assert field_simple_string.confidence == FieldConfidence.CERTAIN assert str(field_simple_string) == "field_simple_string-value" - field_simple_int = inference_result.inference.result.fields["field_simple_int"] + field_simple_int = response.inference.result.fields["field_simple_int"] assert isinstance(field_simple_int, SimpleField) assert isinstance(field_simple_int.value, float) - field_simple_float = inference_result.inference.result.fields["field_simple_float"] + field_simple_float = response.inference.result.fields["field_simple_float"] assert isinstance(field_simple_float, SimpleField) assert isinstance(field_simple_float.value, float) - field_simple_bool = inference_result.inference.result.fields["field_simple_bool"] + field_simple_bool = response.inference.result.fields["field_simple_bool"] assert isinstance(field_simple_bool, SimpleField) assert field_simple_bool.value is True assert str(field_simple_bool) == "True" - field_simple_null = 
inference_result.inference.result.fields["field_simple_null"] + field_simple_null = response.inference.result.fields["field_simple_null"] assert isinstance(field_simple_null, SimpleField) assert field_simple_null.value is None assert str(field_simple_null) == "" - assert rst_sample == str(inference_result) + assert rst_sample == str(response) @pytest.mark.v2 def test_standard_field_object(): json_sample, _ = _get_inference_samples("standard_field_types") - inference_result = InferenceResponse(json_sample) + response = InferenceResponse(json_sample) - object_field = inference_result.inference.result.fields["field_object"] + object_field = response.inference.result.fields["field_object"] assert isinstance(object_field, ObjectField) sub_fields = object_field.fields @@ -163,10 +153,10 @@ def test_standard_field_object(): @pytest.mark.v2 def test_standard_field_object_list(): json_sample, _ = _get_inference_samples("standard_field_types") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) + response = InferenceResponse(json_sample) + assert isinstance(response.inference, Inference) - field_object_list = inference_result.inference.result.fields["field_object_list"] + field_object_list = response.inference.result.fields["field_object_list"] assert isinstance(field_object_list, ListField) assert len(field_object_list.items) == 2 for object_field in field_object_list.object_items: @@ -176,10 +166,10 @@ def test_standard_field_object_list(): @pytest.mark.v2 def test_standard_field_simple_list(): json_sample, _ = _get_inference_samples("standard_field_types") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) + response = InferenceResponse(json_sample) + assert isinstance(response.inference, Inference) - field_simple_list = inference_result.inference.result.fields["field_simple_list"] + field_simple_list = 
response.inference.result.fields["field_simple_list"] assert isinstance(field_simple_list, ListField) assert len(field_simple_list.simple_items) == 2 for object_field in field_simple_list.simple_items: @@ -189,15 +179,16 @@ def test_standard_field_simple_list(): @pytest.mark.v2 def test_raw_texts(): json_sample, _ = _get_inference_samples("raw_texts") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) + response = InferenceResponse(json_sample) + assert isinstance(response.inference, Inference) - assert inference_result.inference.result.raw_text - assert len(inference_result.inference.result.raw_text.pages) == 2 + assert response.inference.result.raw_text + assert len(response.inference.result.raw_text.pages) == 2 assert ( - inference_result.inference.result.raw_text.pages[0].content + response.inference.result.raw_text.pages[0].content == "This is the raw text of the first page..." ) + assert response.inference.active_options.raw_text is True @pytest.mark.v2 @@ -208,6 +199,7 @@ def test_rag_metadata_when_matched(): rag = response.inference.result.rag assert isinstance(rag, RagMetadata) assert rag.retrieved_document_id == "12345abc-1234-1234-1234-123456789abc" + assert response.inference.active_options.rag is True @pytest.mark.v2 @@ -218,41 +210,37 @@ def test_rag_metadata_when_not_matched(): rag = response.inference.result.rag assert isinstance(rag, RagMetadata) assert rag.retrieved_document_id is None + assert response.inference.active_options.rag is True @pytest.mark.v2 def test_full_inference_response(): json_sample, rst_sample = _get_product_samples("financial_document", "complete") - inference_result = InferenceResponse(json_sample) + response = InferenceResponse(json_sample) - assert isinstance(inference_result.inference, Inference) - assert inference_result.inference.id == "12345678-1234-1234-1234-123456789abc" - assert isinstance(inference_result.inference.result.fields["date"], SimpleField) - 
assert inference_result.inference.result.fields["date"].value == "2019-11-02" - assert isinstance(inference_result.inference.result.fields["taxes"], ListField) - assert isinstance( - inference_result.inference.result.fields["taxes"].items[0], ObjectField - ) + assert isinstance(response.inference, Inference) + assert response.inference.id == "12345678-1234-1234-1234-123456789abc" + assert isinstance(response.inference.result.fields["date"], SimpleField) + assert response.inference.result.fields["date"].value == "2019-11-02" + assert isinstance(response.inference.result.fields["taxes"], ListField) + assert isinstance(response.inference.result.fields["taxes"].items[0], ObjectField) assert ( - inference_result.inference.result.fields["customer_address"] - .fields["city"] - .value + response.inference.result.fields["customer_address"].fields["city"].value == "New York" ) assert ( - inference_result.inference.result.fields["taxes"].items[0].fields["base"].value - == 31.5 + response.inference.result.fields["taxes"].items[0].fields["base"].value == 31.5 ) - assert isinstance(inference_result.inference.model, InferenceModel) - assert inference_result.inference.model.id == "12345678-1234-1234-1234-123456789abc" + assert isinstance(response.inference.model, InferenceModel) + assert response.inference.model.id == "12345678-1234-1234-1234-123456789abc" - assert isinstance(inference_result.inference.file, InferenceFile) - assert inference_result.inference.file.name == "complete.jpg" - assert inference_result.inference.file.page_count == 1 - assert inference_result.inference.file.mime_type == "image/jpeg" - assert not inference_result.inference.file.alias - assert not inference_result.inference.result.raw_text + assert isinstance(response.inference.file, InferenceFile) + assert response.inference.file.name == "complete.jpg" + assert response.inference.file.page_count == 1 + assert response.inference.file.mime_type == "image/jpeg" + assert not response.inference.file.alias + 
assert not response.inference.result.raw_text @pytest.mark.v2 @@ -265,9 +253,9 @@ def test_field_locations_and_confidence() -> None: "financial_document", "complete_with_coordinates" ) - inference_result = InferenceResponse(json_sample) + response = InferenceResponse(json_sample) - date_field: SimpleField = inference_result.inference.result.fields["date"] + date_field: SimpleField = response.inference.result.fields["date"] assert date_field.locations, "date field should expose locations" location = date_field.locations[0] @@ -304,15 +292,15 @@ def test_field_locations_and_confidence() -> None: @pytest.mark.v2 def test_text_context_field_is_false() -> None: json_sample, _ = _get_product_samples("financial_document", "complete") - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference.active_options, InferenceActiveOptions) - assert inference_result.inference.active_options.text_context is False + response = InferenceResponse(json_sample) + assert isinstance(response.inference.active_options, InferenceActiveOptions) + assert response.inference.active_options.text_context is False @pytest.mark.v2 def test_text_context_field_is_true() -> None: with open(V2_DATA_DIR / "inference" / "text_context_enabled.json", "r") as file: json_sample = json.load(file) - inference_result = InferenceResponse(json_sample) - assert isinstance(inference_result.inference.active_options, InferenceActiveOptions) - assert inference_result.inference.active_options.text_context is True + response = InferenceResponse(json_sample) + assert isinstance(response.inference.active_options, InferenceActiveOptions) + assert response.inference.active_options.text_context is True diff --git a/tests/v2/test_client.py b/tests/v2/test_client.py index 15d4f583..7d6453e7 100644 --- a/tests/v2/test_client.py +++ b/tests/v2/test_client.py @@ -7,6 +7,7 @@ from mindee.error.mindee_error import MindeeApiV2Error, MindeeError from mindee.error.mindee_http_error_v2 import 
MindeeHTTPErrorV2 from mindee.input import LocalInputSource, PathInput +from mindee.input.inference_parameters import DataSchema from mindee.mindee_http.base_settings import USER_AGENT from mindee.parsing.v2.inference import Inference from mindee.parsing.v2.job import Job @@ -137,7 +138,11 @@ def test_enqueue_and_parse_path_with_env_token(custom_base_url_client): with pytest.raises(MindeeHTTPErrorV2): custom_base_url_client.enqueue_and_get_inference( input_doc, - InferenceParameters("dummy-model", text_context="ignore this message"), + InferenceParameters( + "dummy-model", + text_context="ignore this message", + data_schema=DataSchema(replace={"test_field": {}}), + ), ) diff --git a/tests/v2/test_client_integration.py b/tests/v2/test_client_integration.py index 3203cebf..e1d45bbf 100644 --- a/tests/v2/test_client_integration.py +++ b/tests/v2/test_client_integration.py @@ -6,6 +6,7 @@ from mindee import ClientV2, InferenceParameters, PathInput, UrlInputSource from mindee.error.mindee_http_error_v2 import MindeeHTTPErrorV2 from mindee.parsing.v2 import InferenceActiveOptions +from mindee.input.inference_parameters import DataSchema from mindee.parsing.v2.inference_response import InferenceResponse from tests.utils import FILE_TYPES_DIR, V2_PRODUCT_DATA_DIR @@ -26,6 +27,22 @@ def v2_client() -> ClientV2: return ClientV2(api_key) +def _basic_assert_success( + response: InferenceResponse, page_count: int, model_id: str +) -> None: + assert response is not None + assert response.inference is not None + + assert response.inference.file is not None + assert response.inference.file.page_count == page_count + + assert response.inference.model is not None + assert response.inference.model.id == model_id + + assert response.inference.result is not None + assert response.inference.active_options is not None + + @pytest.mark.integration @pytest.mark.v2 def test_parse_file_empty_multiple_pages_must_succeed( @@ -50,12 +67,9 @@ def 
test_parse_file_empty_multiple_pages_must_succeed( response: InferenceResponse = v2_client.enqueue_and_get_inference( input_source, params ) - assert response is not None - assert response.inference is not None + _basic_assert_success(response=response, page_count=2, model_id=findoc_model_id) - assert response.inference.file is not None assert response.inference.file.name == "multipage_cut-2.pdf" - assert response.inference.file.page_count == 2 assert response.inference.model is not None assert response.inference.model.id == findoc_model_id @@ -68,8 +82,6 @@ def test_parse_file_empty_multiple_pages_must_succeed( assert response.inference.active_options.confidence is False assert response.inference.active_options.text_context is False - assert response.inference.result is not None - assert response.inference.result.raw_text is not None assert len(response.inference.result.raw_text.pages) == 2 @@ -96,15 +108,9 @@ def test_parse_file_empty_single_page_options_must_succeed( response: InferenceResponse = v2_client.enqueue_and_get_inference( input_source, params ) - assert response is not None - assert response.inference is not None + _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id) - assert response.inference.model is not None - assert response.inference.model.id == findoc_model_id - - assert response.inference.file is not None assert response.inference.file.name == "blank_1.pdf" - assert response.inference.file.page_count == 1 assert isinstance(response.inference.active_options, InferenceActiveOptions) assert response.inference.active_options is not None @@ -114,8 +120,6 @@ def test_parse_file_empty_single_page_options_must_succeed( assert response.inference.active_options.confidence is True assert response.inference.active_options.text_context is False - assert response.inference.result is not None - @pytest.mark.integration @pytest.mark.v2 @@ -142,13 +146,9 @@ def test_parse_file_filled_single_page_must_succeed( response: 
InferenceResponse = v2_client.enqueue_and_get_inference( input_source, params ) + _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id) - assert response is not None - assert response.inference is not None - - assert response.inference.file is not None assert response.inference.file.name == "default_sample.jpg" - assert response.inference.file.page_count == 1 assert response.inference.model is not None assert response.inference.model.id == findoc_model_id @@ -163,7 +163,6 @@ def test_parse_file_filled_single_page_must_succeed( assert response.inference.result.raw_text is None - assert response.inference.result is not None supplier_name = response.inference.result.fields["supplier_name"] assert supplier_name is not None assert supplier_name.value == "John Smith" @@ -273,15 +272,46 @@ def test_blank_url_input_source_must_succeed( response: InferenceResponse = v2_client.enqueue_and_get_inference( input_source, params ) - assert response is not None - assert response.inference is not None - - assert response.inference.file is not None - assert response.inference.file.page_count == 1 + _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id) - assert response.inference.model is not None - assert response.inference.model.id == findoc_model_id - assert response.inference.result is not None +@pytest.mark.integration +@pytest.mark.v2 +def test_data_schema_must_succeed( + v2_client: ClientV2, + findoc_model_id: str, +) -> None: + """ + Load a blank PDF from a local path with a data schema override and make sure the replacement schema is applied to the inference.
+ """ + input_path: Path = FILE_TYPES_DIR / "pdf" / "blank_1.pdf" - assert response.inference.active_options is not None + input_source = PathInput(input_path) + params = InferenceParameters( + model_id=findoc_model_id, + rag=False, + raw_text=False, + polygon=False, + confidence=False, + webhook_ids=[], + data_schema=DataSchema( + replace={ + "fields": [ + { + "name": "test", + "title": "Test", + "is_array": False, + "type": "string", + "description": "A test field", + } + ] + } + ), + alias="py_integration_data_schema_override", + ) + response: InferenceResponse = v2_client.enqueue_and_get_inference( + input_source, params + ) + _basic_assert_success(response=response, page_count=1, model_id=findoc_model_id) + assert response.inference.active_options.data_schema.replace is True + assert response.inference.result.fields["test"] is not None