Skip to content

Commit e5cfec4

Browse files
committed
chore: add DoclingDocument validation rules
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 99eabb3 commit e5cfec4

File tree

3 files changed

+145
-10
lines changed

3 files changed

+145
-10
lines changed

docling_core/types/doc/document.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5815,6 +5815,43 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
58155815
res_doc._update_from_index(doc_index)
58165816
return res_doc
58175817

5818+
def _validate_rules(self):
    """Check the document tree against structural consistency rules.

    Walks the items yielded by ``iterate_items`` (groups included, pictures
    traversed, every ``ContentLayer`` enabled) and enforces, per item:

    * ``ListGroup``: each child reference must resolve to a ``ListItem``;
    * any other ``GroupItem``: if it has a parent it must have children
      (an empty body is tolerated, other empty groups are not);
    * ``ListItem``: it must have a parent, and that parent must resolve
      to a ``ListGroup``.

    Raises:
        ValueError: on the first rule violation encountered.
    """
    every_layer = set(ContentLayer)
    walker = self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=every_layer,
    )

    for node, _ in walker:
        # ListGroup is tested first, so list groups only get the
        # children-type check and never reach the generic group branch.
        if isinstance(node, ListGroup):
            for ref in node.children:
                # NOTE: `{child.label=}` echoes the expression text into the
                # message, so this local has to stay named `child`.
                child = ref.resolve(self)
                if isinstance(child, ListItem):
                    continue
                raise ValueError(
                    f"ListGroup {node.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
                )

        elif isinstance(node, GroupItem):
            # tolerate empty body, but not other groups
            if node.parent and not node.children:
                raise ValueError(f"Group {node.self_ref} has no children")

        elif isinstance(node, ListItem):
            parent_ref = node.parent
            if parent_ref is None:
                raise ValueError(f"ListItem {node.self_ref} has no parent")
            if not isinstance(parent_ref.resolve(self), ListGroup):
                raise ValueError(
                    f"ListItem {node.self_ref} has non-ListGroup parent: {parent_ref.cref}"
                )
5854+
58185855

58195856
# deprecated aliases (kept for backwards compatibility):
58205857
BasePictureData = BaseAnnotation

test/data/doc/concatenated.html

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
<!DOCTYPE html>
22
<html>
33
<head>
4-
<meta charset="UTF-8">
4+
<meta charset="UTF-8"/>
55
<title>2501.17887v1 + Untitled 1 + 2311.18481v1</title>
6-
<meta name="generator" content="Docling HTML Serializer">
6+
<meta name="generator" content="Docling HTML Serializer"/>
77
<style>
88
html {
99
background-color: #e1e1e1;
@@ -328,11 +328,13 @@ <h2>1. Introduction</h2>
328328
<ul>
329329
<li style="list-style-type: '■ ';">list item 1</li>
330330
<li style="list-style-type: '■ ';">list item 2</li>
331-
<li style="list-style-type: '■ ';">list item 3
331+
<li style="list-style-type: '■ ';">
332+
list item 3
332333
<ol>
333334
<li>list item 3.a</li>
334335
<li>list item 3.b</li>
335-
<li>list item 3.c
336+
<li>
337+
list item 3.c
336338
<ol>
337339
<li>list item 3.c.i</li>
338340
</ol>
@@ -356,11 +358,16 @@ <h2>1. Introduction</h2>
356358
</ul>
357359
<ul>
358360
<li style="list-style-type: '■ ';">item 1 of neighboring list</li>
359-
<li style="list-style-type: '■ ';">item 2 of neighboring list
361+
<li style="list-style-type: '■ ';">
362+
item 2 of neighboring list
360363
<ul>
361364
<li style="list-style-type: '□ ';">item 1 of sub list</li>
362-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
363-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
365+
<li style="list-style-type: '□ ';">
366+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
367+
</li>
368+
<li style="list-style-type: '□ ';">
369+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
370+
</li>
364371
</ul>
365372
</li>
366373
</ul>
@@ -384,10 +391,12 @@ <h2>1. Introduction</h2>
384391
<ol>
385392
<li style="list-style-type: '(i) ';">Item 1 in A</li>
386393
<li style="list-style-type: '(ii) ';">Item 2 in A</li>
387-
<li style="list-style-type: '(iii) ';">Item 3 in A
394+
<li style="list-style-type: '(iii) ';">
395+
Item 3 in A
388396
<ol>
389397
<li>Item 1 in B</li>
390-
<li style="list-style-type: '42. ';">Item 2 in B
398+
<li style="list-style-type: '42. ';">
399+
Item 2 in B
391400
<ol>
392401
<li>Item 1 in C</li>
393402
<li>Item 2 in C</li>

test/test_docling_doc.py

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from collections import deque
33
from copy import deepcopy
44
from pathlib import Path
5-
from typing import List, Optional
5+
from typing import List, Optional, Union
66
from unittest.mock import Mock
77

88
import pytest
@@ -2003,3 +2003,92 @@ def test_concatenate():
20032003
with open(exp_html_file, "r", encoding="utf-8") as f:
20042004
exp_html_data = f.read()
20052005
assert html_data == exp_html_data
2006+
2007+
2008+
def test_list_group_with_list_items():
    """A list group holding only list items passes rule validation."""
    doc = DoclingDocument(name="")
    list_group = doc.add_list_group()
    for item_text in ("ListItem 1", "ListItem 2"):
        doc.add_list_item(text=item_text, parent=list_group)

    # Must not raise.
    doc._validate_rules()
2015+
2016+
2017+
def test_list_group_with_non_list_items():
    """A list group that also holds a plain text item fails validation."""
    doc = DoclingDocument(name="")
    list_group = doc.add_list_group()
    doc.add_list_item(text="ListItem 1", parent=list_group)
    # Offending child: a TEXT item placed directly inside the list group.
    doc.add_text(
        text="non-ListItem in ListGroup",
        label=DocItemLabel.TEXT,
        parent=list_group,
    )

    with pytest.raises(ValueError):
        doc._validate_rules()
2027+
2028+
2029+
def test_list_item_outside_list_group():
    """A list item attached straight to the body fails validation."""

    def unsafe_add_list_item(
        doc: DoclingDocument,
        text: str,
        enumerated: bool = False,
        marker: Optional[str] = None,
        orig: Optional[str] = None,
        prov: Optional[ProvenanceItem] = None,
        parent: Optional[NodeItem] = None,
        content_layer: Optional[ContentLayer] = None,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
    ):
        # Hand-builds a ListItem and wires it under `parent` (the document
        # body when none is given) without any check on the parent's type.
        target = parent or doc.body
        cref = f"#/texts/{len(doc.texts)}"

        new_item = ListItem(
            text=text,
            orig=orig or text,
            self_ref=cref,
            parent=target.get_ref(),
            enumerated=enumerated,
            marker=marker or "",
            formatting=formatting,
            hyperlink=hyperlink,
        )
        if prov:
            new_item.prov.append(prov)
        if content_layer:
            new_item.content_layer = content_layer

        # Register the item in the document and link it from its parent.
        doc.texts.append(new_item)
        target.children.append(RefItem(cref=cref))

        return new_item

    doc = DoclingDocument(name="")
    unsafe_add_list_item(doc=doc, text="ListItem outside ListGroup")
    with pytest.raises(ValueError):
        doc._validate_rules()
2074+
2075+
2076+
def test_list_item_inside_list_group():
    """A list item whose parent is a list group passes validation."""
    document = DoclingDocument(name="")
    container = document.add_list_group()
    document.add_list_item(text="ListItem inside ListGroup", parent=container)

    # Must not raise.
    document._validate_rules()
2081+
2082+
2083+
def test_group_with_children():
    """A group that contains at least one child passes validation."""
    document = DoclingDocument(name="")
    group = document.add_group()
    document.add_text(text="Text in group", label=DocItemLabel.TEXT, parent=group)

    # Must not raise.
    document._validate_rules()
2088+
2089+
2090+
def test_group_without_children():
    """A group added to the document with no children fails validation."""
    document = DoclingDocument(name="")
    document.add_group()  # deliberately left empty

    with pytest.raises(ValueError):
        document._validate_rules()

0 commit comments

Comments
 (0)