From e0209bb23b7d0ee3a62c9840352bee77c8978441 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 20 May 2025 06:18:10 -0400 Subject: [PATCH 1/2] adding _validate_annotations_consistency function Signed-off-by: Peter Staar --- .../dataset_builders/cvat_dataset_builder.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/docling_eval/dataset_builders/cvat_dataset_builder.py b/docling_eval/dataset_builders/cvat_dataset_builder.py index 0ca8d51c..1e5abe03 100644 --- a/docling_eval/dataset_builders/cvat_dataset_builder.py +++ b/docling_eval/dataset_builders/cvat_dataset_builder.py @@ -379,7 +379,84 @@ def parse_annotation(self, image_annot: Dict) -> Tuple: merges, group, ) + + def _validate_annotations_consistency(self, + boxes, + lines, + reading_order, + to_captions, + to_footnotes, + to_values, + merges, + group) -> bool: + """Validate annotations consistency.""" + + def _validate_to_captions(): + """validate to captions. + + - Make sure the first bbox is a FloatingItem and the subsequent + are of type caption. + - Make sure that the to_caption only starts at the beginning of + a group, if a group is present. + """ + return True + + def _validate_to_footnotes(): + """validate to footnotes. + + - Make sure the first bbox is a FloatingItem and the subsequent + are of type footnote. + - Make sure that the to_caption only starts at the beginning of + a group, if a group is present. + """ + return True + + def _validate_captions(): + """validate captions. + + Make sure each caption either is the start of a to_caption or + has a merge node. + """ + return True + + def _validate_list_items(): + """validate list items. + + Make sure every list-item has a group or merge point. + """ + return True + + def _validate_reading_order(): + """validate reading-order. + Make sure that every bbox has a reading-order point or is + entirely overlapped with a bbox that has a reading-order. + """ + return True + + def _validate_merges(): + """validate merges. + + Make sure that all bbox in a merge are of same type + """ + return True + + def _validate_group(): + """validate groups. + + Make sure that all bbox in a group are of same type + """ + return True + + + return ( + _validate_to_captions() and + _validate_to_footnotes() and + _validate_reading_order() and + _validate_merges() and + _validate_group() and + __validate_list_items() + def create_prov( self, box: Dict, @@ -895,6 +972,17 @@ def create_true_document( _log.error(f"Incorrect annotation for {basename}: no reading order found") return None + if self._validate_annotations_consistency(boxes=boxes, + lines=lines, + reading_order=reading_order, + to_captions=to_captions, + to_footnotes=to_footnotes, + to_values=to_values, + merges=merges, + group=group): + _log.error(f"Inconsistent annotation for {basename}: skipping") + return None + # Original Groundtruth and Prediction files orig_file = desc.document_file From 0ac2fe066ae2fdf06d404ddb74d39c2268fb5bed Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 21 May 2025 07:08:49 -0400 Subject: [PATCH 2/2] implemented some of the consistency rules Signed-off-by: Peter Staar --- .../dataset_builders/cvat_dataset_builder.py | 112 +++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/docling_eval/dataset_builders/cvat_dataset_builder.py b/docling_eval/dataset_builders/cvat_dataset_builder.py index 1e5abe03..d7d4d08c 100644 --- a/docling_eval/dataset_builders/cvat_dataset_builder.py +++ b/docling_eval/dataset_builders/cvat_dataset_builder.py @@ -388,7 +388,7 @@ def _validate_annotations_consistency(self, to_footnotes, to_values, merges, - group) -> bool: + groups) -> bool: """Validate annotations consistency.""" def _validate_to_captions(): @@ -399,16 +399,68 @@ def _validate_to_captions(): - Make sure that the to_caption only starts at the beginning of a group, if a group is present. """ + for to_caption in to_captions: + + boxids = to_caption["boxids"] + if len(boxids)!=2: + _log.error(f"to_caption is longer than 2: {len(to_caption['boxids'])}") + return False + + box_source = boxes[boxids[0]] + box_target = boxes[boxids[1]] + + label_source = DocItemLabel(box_source["@label"]) + label_target = DocItemLabel(box_target["@label"]) + + if label_source not in [ + DocItemLabel.PICTURE, + DocItemLabel.TABLE, + DocItemLabel.CODE, + DocItemLabel.FORM, + ]: + _log.error(f"to_caption label-source: {label_source}") + return False + + if label_target!=DocItemLabel.CAPTION: + _log.error(f"to_caption label-target: {label_target}") + return False + return True - + def _validate_to_footnotes(): """validate to footnotes. - Make sure the first bbox is a FloatingItem and the subsequent are of type footnote. - - Make sure that the to_caption only starts at the beginning of + - Make sure that the to_footnote only starts at the beginning of a group, if a group is present. """ + for to_footnote in to_footnotes: + + boxids = to_footnote["boxids"] + if len(boxids)!=2: + _log.error(f"to_footnote is longer than 2: {len(to_footnote['boxids'])}") + return False + + box_source = boxes[boxids[0]] + box_target = boxes[boxids[1]] + + label_source = DocItemLabel(box_source["@label"]) + label_target = DocItemLabel(box_target["@label"]) + + if label_source not in [ + DocItemLabel.PICTURE, + DocItemLabel.TABLE, + DocItemLabel.CODE, + DocItemLabel.FORM, + ]: + _log.error(f"to_footnote label-source: {label_source}") + return False + + if label_target!=DocItemLabel.FOOTNOTE: + _log.error(f"to_footnote label-target: {label_target}") + return False + return True def _validate_captions(): @@ -417,6 +469,31 @@ def _validate_captions(): Make sure each caption either is the start of a to_caption or has a merge node. """ + caption_ids = [] + for to_caption in to_captions: + boxids = to_caption["boxids"] + if len(boxids)!=2: + _log.error(f"to_caption is longer than 2: {len(to_caption['boxids'])}") + return False + + box_target = boxes[boxids[1]] + label_target = DocItemLabel(box_target["@label"]) + + if label_target==DocItemLabel.CAPTION: + caption_ids.append(boxids[1]) + + for i,box in enumerate(boxes): + label = DocItemLabel(box_source["@label"]) + + caption_is_linked = False + if label == DocItemLabel.CAPTION: + if i not in caption_ids: + caption_is_linked = True + + if not caption_is_linked: + _log.error(f"caption {box} is not linked to anything") + return False + return True def _validate_list_items(): @@ -424,6 +501,25 @@ def _validate_list_items(): Make sure every list-item has a group or merge point. """ + for i,box in enumerate(boxes): + label = DocItemLabel(box_source["@label"]) + + listitem_is_grouped = True + if label == DocItemLabel.LIST_ITEM: + for group in groups: + if i in group["boxids"]: + listitem_is_grouped = True + + listitem_is_merged = True + if label == DocItemLabel.LIST_ITEM: + for merge in merges: + if i in merge["boxids"]: + listitem_is_merged = True + + if (not listitem_is_linked) or (not listitem_is_merged): + _log.error(f"listitem {box} is not grouped or merged.") + return False + return True def _validate_reading_order(): @@ -447,6 +543,13 @@ def _validate_group(): Make sure that all bbox in a group are of same type """ return True + + def _validate_to_values_links(): + """validate to_value links. + + Make sure that all to_value links are between + """ + return True return ( @@ -456,7 +559,8 @@ def _validate_group(): _validate_merges() and _validate_group() and __validate_list_items() - + ) + def create_prov( self, box: Dict,