♻️ update structure of raw texts (#333)

ianardee · web-flow · commit d9607aae74b3 · 2025-07-10T09:55:00.000+02:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
         ]
 
   - repo: https://github.com/gitleaks/gitleaks
-    rev: v8.18.2
+    rev: v8.18.4
     hooks:
       - id: gitleaks
 
diff --git a/mindee/parsing/v2/inference_options.py b/mindee/parsing/v2/inference_options.py
@@ -1,12 +1,23 @@
-from typing import List, Optional
+from typing import List
 
 from mindee.parsing.common.string_dict import StringDict
 
 
+class RawText:
+    """Raw text extracted from the document."""
+
+    page: int
+    content: str
+
+    def __init__(self, raw_response: StringDict):
+        self.page = raw_response["page"]
+        self.content = raw_response["content"]
+
+
 class InferenceOptions:
     """Optional information about the document."""
 
-    raw_text: Optional[List[str]]
+    raw_texts: List[RawText]
 
     def __init__(self, raw_response: StringDict):
-        self.raw_text = raw_response["raw_text"] if "raw_text" in raw_response else None
+        self.raw_texts = [RawText(rt) for rt in raw_response.get("raw_texts", [])]
diff --git a/tests/data b/tests/data
@@ -1 +1 @@
-Subproject commit f599a960e78f4a390984c6263f387aa8cdebe0f0
+Subproject commit 2e2788376cd0dd6168f1917129588fab6089378d
diff --git a/tests/v2/test_inference_response.py b/tests/v2/test_inference_response.py
@@ -3,7 +3,6 @@
 import pytest
 
 from mindee import ClientV2, LocalResponse
-from mindee.parsing.common.string_dict import StringDict
 from mindee.parsing.v2 import (
     Inference,
     InferenceFile,
@@ -17,90 +16,30 @@
 
 
 @pytest.fixture
-def inference_result_json() -> StringDict:
-    return {
-        "inference": {
-            "model": {"id": "test-model-id"},
-            "file": {"name": "test-file-name.jpg", "alias": None},
-            "result": {
-                "fields": {
-                    "field_simple": {"value": "value_1"},
-                    "field_object": {
-                        "fields": {
-                            "sub_object_simple": {"value": "value_2"},
-                            "sub_object_list": {
-                                "items": [
-                                    {
-                                        "fields": {
-                                            "sub_object_list_sub_list_simple": {
-                                                "value": "value_3"
-                                            }
-                                        }
-                                    },
-                                    {
-                                        "fields": {
-                                            "sub_object_list_sub_list_object_subobject_1": {
-                                                "value": "value_4"
-                                            },
-                                            "sub_object_list_sub_list_object_subobject_2": {
-                                                "value": "value_5"
-                                            },
-                                        }
-                                    },
-                                ]
-                            },
-                            "sub_object_object": {
-                                "fields": {
-                                    "sub_object_object_sub_object_simple": {
-                                        "value": "value_6"
-                                    },
-                                    "sub_object_object_sub_object_object": {
-                                        "fields": {
-                                            "sub_object_object_sub_object_object_simple_1": {
-                                                "value": "value_7"
-                                            },
-                                            "sub_object_object_sub_object_object_simple_2": {
-                                                "value": "value_8"
-                                            },
-                                        }
-                                    },
-                                    "sub_object_object_sub_object_list": {
-                                        "items": [
-                                            {
-                                                "fields": {
-                                                    "sub_object_object_sub_object_list_simple": {
-                                                        "value": "value_9"
-                                                    },
-                                                    "sub_object_object_sub_object_list_object": {
-                                                        "fields": {
-                                                            "sub_object_object_sub_object_list_object_subobject_1": {
-                                                                "value": "value_10"
-                                                            },
-                                                            "sub_object_object_sub_object_list_object_subobject_2": {
-                                                                "value": "value_11"
-                                                            },
-                                                        }
-                                                    },
-                                                }
-                                            }
-                                        ]
-                                    },
-                                }
-                            },
-                        }
-                    },
-                },
-                "options": {
-                    "raw_text": ["toto", "tata", "titi"],
-                },
-            },
-        }
-    }
+def deep_nested_fields() -> dict:
+    with (V2_DATA_DIR / "inference/deep_nested_fields.json").open(
+        "r", encoding="utf-8"
+    ) as fh:
+        return json.load(fh)
+
+
+@pytest.fixture
+def standard_field_types() -> dict:
+    with (V2_DATA_DIR / "inference/standard_field_types.json").open(
+        "r", encoding="utf-8"
+    ) as fh:
+        return json.load(fh)
+
+
+@pytest.fixture
+def raw_texts() -> dict:
+    with (V2_DATA_DIR / "inference/raw_texts.json").open("r", encoding="utf-8") as fh:
+        return json.load(fh)
 
 
 @pytest.mark.v2
-def test_inference_response(inference_result_json):
-    inference_result = InferenceResponse(inference_result_json)
+def test_deep_nested_fields(deep_nested_fields):
+    inference_result = InferenceResponse(deep_nested_fields)
     assert isinstance(inference_result.inference, Inference)
     assert isinstance(
         inference_result.inference.result.fields.field_simple, SimpleField
@@ -166,9 +105,37 @@ def test_inference_response(inference_result_json):
         == "value_9"
     )
 
+
+@pytest.mark.v2
+def test_standard_field_types(standard_field_types):
+    inference_result = InferenceResponse(standard_field_types)
+    assert isinstance(inference_result.inference, Inference)
+    assert isinstance(
+        inference_result.inference.result.fields.field_simple, SimpleField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_object, ObjectField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_simple_list, ListField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_object_list, ListField
+    )
+
+
+@pytest.mark.v2
+def test_raw_texts(raw_texts):
+    inference_result = InferenceResponse(raw_texts)
+    assert isinstance(inference_result.inference, Inference)
+
     assert inference_result.inference.result.options
-    assert len(inference_result.inference.result.options.raw_text) == 3
-    assert inference_result.inference.result.options.raw_text[0] == "toto"
+    assert len(inference_result.inference.result.options.raw_texts) == 2
+    assert inference_result.inference.result.options.raw_texts[0].page == 0
+    assert (
+        inference_result.inference.result.options.raw_texts[0].content
+        == "This is the raw text of the first page..."
+    )
 
 
 @pytest.mark.v2

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ repos:`
`22`	`22`	`]`
`23`	`23`
`24`	`24`	`- repo: https://github.com/gitleaks/gitleaks`
`25`		`- rev: v8.18.2`
	`25`	`+ rev: v8.18.4`
`26`	`26`	`hooks:`
`27`	`27`	`- id: gitleaks`
`28`	`28`