def _validate_rules(self):
    """Check the document tree against the list/group structure rules.

    Walks the items produced by ``self.iterate_items`` (requesting groups,
    picture traversal and every ``ContentLayer`` member) and stops at the
    first violation.

    Raises:
        ValueError: If a ListGroup has a child that is not a ListItem, if a
            ListItem has no parent or its parent is not a ListGroup, or if
            a group that has a parent has no children.
    """

    def check_list_group(group: ListGroup) -> None:
        # References are resolved lazily, one child at a time.
        # NOTE: the local must stay named ``child`` because the ``=``
        # specifier below embeds the expression text in the message.
        for child in (ref.resolve(self) for ref in group.children):
            if isinstance(child, ListItem):
                continue
            raise ValueError(
                f"ListGroup {group.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
            )

    def check_list_item(list_item: ListItem) -> None:
        parent_ref = list_item.parent
        if parent_ref is None:
            raise ValueError(f"ListItem {list_item.self_ref} has no parent")
        if isinstance(parent_ref.resolve(self), ListGroup):
            return
        raise ValueError(
            f"ListItem {list_item.self_ref} has non-ListGroup parent: {parent_ref.cref}"
        )

    def check_group(group: GroupItem) -> None:
        # A parentless group (the body) may be empty; any other group may not.
        if not group.parent or group.children:
            return
        raise ValueError(f"Group {group.self_ref} has no children")

    # First matching type wins, exactly like an if/elif chain, so the order
    # of this table is significant.
    # NOTE(review): if ListGroup is a GroupItem subclass, list groups never
    # reach the "no children" check — confirm that this is intended.
    checks = (
        (ListGroup, check_list_group),
        (GroupItem, check_group),
        (ListItem, check_list_item),
    )

    every_layer = set(ContentLayer)
    walk = self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=every_layer,
    )
    for node, _ in walk:
        for node_type, check in checks:
            if isinstance(node, node_type):
                check(node)
                break
def test_list_group_with_list_items():
    """A ListGroup whose children are all ListItems passes validation."""
    good_doc = DoclingDocument(name="")
    l1 = good_doc.add_list_group()
    good_doc.add_list_item(text="ListItem 1", parent=l1)
    good_doc.add_list_item(text="ListItem 2", parent=l1)

    good_doc._validate_rules()


def test_list_group_with_non_list_items():
    """A ListGroup that also holds a plain text item is rejected."""
    bad_doc = DoclingDocument(name="")
    l1 = bad_doc.add_list_group()
    bad_doc.add_list_item(text="ListItem 1", parent=l1)
    bad_doc.add_text(
        text="non-ListItem in ListGroup", label=DocItemLabel.TEXT, parent=l1
    )

    # Pin the failure reason so an unrelated ValueError cannot satisfy the test.
    with pytest.raises(ValueError, match="contains non-ListItem"):
        bad_doc._validate_rules()


def test_list_item_outside_list_group():
    """A ListItem attached directly to the body is rejected."""

    def unsafe_add_list_item(doc: DoclingDocument, text: str) -> ListItem:
        """Append a ListItem under ``doc.body`` without using ``add_list_item``.

        The item is wired in by hand (appended to ``doc.texts`` and referenced
        from the body's children) so that its parent is the body rather than
        a ListGroup.
        """
        parent = doc.body
        cref = f"#/texts/{len(doc.texts)}"
        list_item = ListItem(
            text=text,
            orig=text,
            self_ref=cref,
            parent=parent.get_ref(),
            enumerated=False,
            marker="",
            formatting=None,
            hyperlink=None,
        )

        doc.texts.append(list_item)
        parent.children.append(RefItem(cref=cref))

        return list_item

    bad_doc = DoclingDocument(name="")
    unsafe_add_list_item(doc=bad_doc, text="ListItem outside ListGroup")
    with pytest.raises(ValueError, match="has non-ListGroup parent"):
        bad_doc._validate_rules()


def test_list_item_inside_list_group():
    """A ListItem added under a ListGroup passes validation."""
    doc = DoclingDocument(name="")
    l1 = doc.add_list_group()
    doc.add_list_item(text="ListItem inside ListGroup", parent=l1)
    doc._validate_rules()


def test_group_with_children():
    """A group that contains at least one item passes validation."""
    good_doc = DoclingDocument(name="")
    grp = good_doc.add_group()
    good_doc.add_text(text="Text in group", label=DocItemLabel.TEXT, parent=grp)
    good_doc._validate_rules()


def test_group_without_children():
    """A group with no children is rejected."""
    bad_doc = DoclingDocument(name="")
    bad_doc.add_group()
    with pytest.raises(ValueError, match="has no children"):
        bad_doc._validate_rules()