From 072e6fb1a4d234beeee21a25bf8b54f34377e12a Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 1 Sep 2025 07:24:02 +0200 Subject: [PATCH 1/2] implementing the form structure based on the new iso-standard Signed-off-by: Peter Staar --- docling_core/types/doc/document.py | 60 ++++++++++++++++++++++++++---- docling_core/types/doc/labels.py | 4 ++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 9d8dc332..447278e6 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1921,13 +1921,58 @@ def export_to_document_tokens( return text +class FormHeaderItem(SectionHeaderItem): + """FormHeaderItem.""" + + label: typing.Literal[DocItemLabel.FORM_HEADER] = DocItemLabel.FORM_HEADER + +class FormTextItem(TextItem): + """FormTextItem.""" + + label: typing.Literal[DocItemLabel.FORM_TEXT] = DocItemLabel.FORM_TEXT + +class FormListItem(TextItem): + """FormListItem.""" + + label: typing.Literal[DocItemLabel.FORM_ITEM] = DocItemLabel.FORM_ITEM + + marker: Optional[TextItem] + + key: TextItem + value: TextItem + class FormItem(FloatingItem): """FormItem.""" label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM - graph: GraphData + def add(self, item: Union["FormItem", FormHeaderItem, FormTextItem, FormListItem]): + return + + def add_form(self, item: "FormItem") -> NodeItem: + item.parent = self.cref + self.children.append(item) + + return item + + def add_form_item(self, item: FormItem): + item.parent = self.cref + self.children.append(item) + + return item + + def add_form_text(self, item: FormTextItem): + item.parent = self.cref + self.children.append(item) + + return item + def add_form_header(self, item: FormHeaderItem): + item.parent = self.cref + self.children.append(item) + + return item + ContentItem = Annotated[ Union[ @@ -2987,7 +3032,7 @@ def add_key_values( def add_form( self, - graph: GraphData, + form: Optional[FormItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, ): @@ -3003,11 +3048,12 @@ def add_form( form_index = len(self.form_items) cref = f"#/form_items/{form_index}" - form_item = FormItem( - graph=graph, - self_ref=cref, - parent=parent.get_ref(), - ) + if form is None: + form = FormItem( + self_ref=cref, + parent=parent.get_ref(), + ) + if prov: form_item.prov.append(prov) diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index e5884bcb..17f6d1cf 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -31,6 +31,10 @@ class DocItemLabel(str, Enum): HANDWRITTEN_TEXT = "handwritten_text" EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms + FORM_HEADER = "form_header" + FORM_ITEM = "form_item" + FORM_TEXT = "form_text" + # Additional labels for markup-based formats (e.g. HTML, Word) PARAGRAPH = "paragraph" REFERENCE = "reference" From 566a0dcd29653aee22104dc05c1906a322e1b2a4 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 22 Sep 2025 11:34:15 +0200 Subject: [PATCH 2/2] working on Forms Signed-off-by: Peter Staar --- docling_core/types/doc/document.py | 58 +++++++++---------- docling_core/types/doc/labels.py | 7 ++- test/data/docling_document/unit/FormItem.yaml | 2 +- test/test_docling_doc.py | 38 +++++++++++- 4 files changed, 69 insertions(+), 36 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 447278e6..29d64c5a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1188,6 +1188,7 @@ class TextItem(DocItem): DocItemLabel.REFERENCE, DocItemLabel.TEXT, DocItemLabel.EMPTY_VALUE, + DocItemLabel.FORM_KEY, ] orig: str # untreated representation @@ -1920,59 +1921,55 @@ def export_to_document_tokens( text = serializer.serialize(item=self).text return text +class CheckboxItem(ListItem): + """FormTextItem.""" + + label: typing.Literal[DocItemLabel.CHECKBOX] = DocItemLabel.CHECKBOX + + checked: bool = False +""" class FormHeaderItem(SectionHeaderItem): - """FormHeaderItem.""" label: typing.Literal[DocItemLabel.FORM_HEADER] = DocItemLabel.FORM_HEADER class FormTextItem(TextItem): - """FormTextItem.""" label: typing.Literal[DocItemLabel.FORM_TEXT] = DocItemLabel.FORM_TEXT - -class FormListItem(TextItem): +""" + +class FormListItem(DocItem): """FormListItem.""" - label: typing.Literal[DocItemLabel.FORM_ITEM] = DocItemLabel.FORM_ITEM + label: typing.Literal[DocItemLabel.FORM_LISTITEM] = DocItemLabel.FORM_LISTITEM - marker: Optional[TextItem] + marker: Optional[TextItem] = None key: TextItem - value: TextItem - -class FormItem(FloatingItem): - """FormItem.""" - - label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM - - def add(self, item: Union["FormItem", FormHeaderItem, FormTextItem, FormListItem]): - return - def add_form(self, item: "FormItem") -> NodeItem: - item.parent = self.cref + def add_value(self, item: Union[CheckboxItem, ListItem, TextItem]) -> NodeItem: + item.parent = self.get_ref() self.children.append(item) - + return item - def add_form_item(self, item: FormItem): - item.parent = self.cref - self.children.append(item) - return item + +class FormItem(FloatingItem): + """FormItem.""" - def add_form_text(self, item: FormTextItem): - item.parent = self.cref - self.children.append(item) + label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM + def add(self, item: Union["FormItem", SectionHeaderItem, TextItem, FormListItem]) -> NodeItem: + item.parent = self.get_ref() + self.children.append(item.get_ref()) + return item - def add_form_header(self, item: FormHeaderItem): - item.parent = self.cref - self.children.append(item) - + def add_listitem(self, doc: DoclingDocument, prov: Optional[ProvenanceItem] = None) -> NodeItem: + li = FormListItem(self_ref=self.get_ref()) return item - + ContentItem = Annotated[ Union[ @@ -1985,6 +1982,7 @@ def add_form_header(self, item: FormHeaderItem): PictureItem, TableItem, KeyValueItem, + FormItem, ], Field(discriminator="label"), ] diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index 17f6d1cf..69454895 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -31,9 +31,10 @@ class DocItemLabel(str, Enum): HANDWRITTEN_TEXT = "handwritten_text" EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms - FORM_HEADER = "form_header" - FORM_ITEM = "form_item" - FORM_TEXT = "form_text" + # FORM_HEADER = "form_header" + FORM_KEY = "form_key" + FORM_LISTITEM = "form_listitem" + CHECKBOX = "checkbox" # Additional labels for markup-based formats (e.g. HTML, Word) PARAGRAPH = "paragraph" diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..219e951e 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -24,7 +24,7 @@ graph: source_cell_id: 1 target_cell_id: 0 image: null -label: form +label: key_value_region parent: null prov: [] references: [] diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 50c263a9..de71fb1c 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -14,6 +14,7 @@ from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size from docling_core.types.doc.document import ( # BoundingBox, CURRENT_VERSION, + CheckboxItem, CodeItem, ContentLayer, DocItem, @@ -42,6 +43,8 @@ TableItem, TextItem, TitleItem, + CheckboxItem, + FormListItem, ) from docling_core.types.doc.labels import ( DocItemLabel, @@ -491,6 +494,7 @@ def verify(dc, obj): elif dc is FormItem: + """ graph = GraphData( cells=[ GraphCell( @@ -524,7 +528,31 @@ def verify(dc, obj): self_ref="#", ) verify(dc, obj) + """ + + key_name = TextItem(text="name", orig="name", self_ref="#", label=DocItemLabel.FORM_KEY) + val_name = TextItem(text="John Doe", orig="name", self_ref="#", label=DocItemLabel.TEXT) + + form_item_name = FormListItem(key=key_name, self_ref="#") + form_item_name.add_value(val_name) + + key_age = TextItem(text="Age", orig="Age", self_ref="#", label=DocItemLabel.FORM_KEY) + + cb_age_0 = CheckboxItem(checked=True, text="0-20", orig="0-20", self_ref="#") + cb_age_1 = CheckboxItem(checked=False, text="20-40", orig="20-40", self_ref="#") + val_age = TextItem(text="other", orig="other", self_ref="#", label=DocItemLabel.TEXT) + + form_item_age = FormListItem(key=key_age, self_ref="#") #, value=[cb_age_0, cb_age_1, val_age]) + for _ in [cb_age_0, cb_age_1, val_age]: + form_item_age.add_value(_) + + form = FormItem(self_ref="#") + + form.add(form_item_name) + form.add(form_item_age) + verify(dc, obj) + elif dc is TitleItem: obj = dc( text="whatever", @@ -571,8 +599,12 @@ def verify(dc, obj): text="E=mc^2", ) verify(dc, obj) - elif dc is GraphData: # we skip this on purpose + elif dc is CheckboxItem: # we skip this on purpose + continue + elif dc is FormListItem: # we skip this on purpose continue + elif dc is GraphData: # we skip this on purpose + continue else: raise RuntimeError(f"New derived class detected {dc.__name__}") @@ -1002,8 +1034,10 @@ def _construct_doc() -> DoclingDocument: doc.add_key_values(graph=graph) - doc.add_form(graph=graph) + form_1 = doc.add_form(graph=graph) + form_1_item_1 = form_1.add_listitem(key="Name") + inline_fmt = doc.add_inline_group() doc.add_text( label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt