From f19d77ce83bae59e77cd62eb5a1fb1b8a0ecf078 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 6 May 2025 13:04:46 +0200 Subject: [PATCH 1/4] Make TableCell a NodeItem, add supporting methods and test case. Signed-off-by: Christoph Auer --- .flake8 | 2 +- docling_core/transforms/serializer/html.py | 6 +- docling_core/types/base.py | 2 +- docling_core/types/doc/document.py | 411 +++++++++++++----- docs/DoclingDocument.json | 66 ++- .../constructed_doc.appended_child.json.gt | 50 ++- .../doc/constructed_doc.deleted_group.json.gt | 50 ++- .../constructed_doc.deleted_picture.json.gt | 50 ++- .../doc/constructed_doc.deleted_text.json.gt | 50 ++- .../data/doc/constructed_doc.embedded.json.gt | 50 ++- .../data/doc/constructed_doc.embedded.yaml.gt | 82 +++- .../doc/constructed_doc.inserted_text.json.gt | 50 ++- .../doc/constructed_doc.referenced.json.gt | 50 ++- .../doc/constructed_doc.referenced.yaml.gt | 82 +++- .../doc/constructed_doc.replaced_item.json.gt | 50 ++- .../data/docling_document/unit/TableItem.yaml | 64 ++- test/test_otsl_table_export.py | 42 +- test/test_rich_tables.py | 84 ++++ 18 files changed, 1058 insertions(+), 183 deletions(-) create mode 100644 test/test_rich_tables.py diff --git a/.flake8 b/.flake8 index 22a8f95f..3dfbb004 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] per-file-ignores = __init__.py:F401 -max-line-length = 88 +max-line-length = 120 exclude = test/* max-complexity = 25 docstring-convention = google diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index d53cc707..8ad8cbbd 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -329,7 +329,11 @@ def serialize( if colstart != j: continue - content = html.escape(cell.text.strip()) + if cell.has_rich_content(): + # TODO: Do something that serializes the cell and its children + content = html.escape(cell.text.strip()) + else: + content = html.escape(cell.text.strip()) celltag = "td" if cell.column_header: celltag = "th" diff --git a/docling_core/types/base.py b/docling_core/types/base.py index 4676e55b..f0429570 100644 --- a/docling_core/types/base.py +++ b/docling_core/types/base.py @@ -29,7 +29,7 @@ from docling_core.utils.validators import validate_datetime, validate_unique_list # (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84": -_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$" +_JSON_POINTER_REGEX: Final[str] = r"^(?:#|\d+(?:#)?)(?:/(?:[\w-]+|\d+))*$" LanguageT = TypeVar("LanguageT", bound=str) IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 2cf48cf0..45d6e911 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -52,7 +52,7 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))] LevelNumber = typing.Annotated[int, Field(ge=1, le=100)] -CURRENT_VERSION: Final = "1.3.0" +CURRENT_VERSION: Final = "1.4.0" DEFAULT_EXPORT_LABELS = { DocItemLabel.TITLE, @@ -281,91 +281,6 @@ class PictureScatterChartData(PictureChartData): points: List[ChartPoint] -class TableCell(BaseModel): - """TableCell.""" - - bbox: Optional[BoundingBox] = None - row_span: int = 1 - col_span: int = 1 - start_row_offset_idx: int - end_row_offset_idx: int - start_col_offset_idx: int - end_col_offset_idx: int - text: str - column_header: bool = False - row_header: bool = False - row_section: bool = False - - @model_validator(mode="before") - @classmethod - def from_dict_format(cls, data: Any) -> Any: - """from_dict_format.""" - if isinstance(data, dict): - # Check if this is a native BoundingBox or a bbox from docling-ibm-models - if ( - # "bbox" not in data - # or data["bbox"] is None - # or isinstance(data["bbox"], BoundingBox) - "text" - in data - ): - return data - text = data["bbox"].get("token", "") - if not len(text): - text_cells = data.pop("text_cell_bboxes", None) - if text_cells: - for el in text_cells: - text += el["token"] + " " - - text = text.strip() - data["text"] = text - - return data - - -class TableData(BaseModel): # TBD - """BaseTableData.""" - - table_cells: List[TableCell] = [] - num_rows: int = 0 - num_cols: int = 0 - - @computed_field # type: ignore - @property - def grid( - self, - ) -> List[List[TableCell]]: - """grid.""" - # Initialise empty table data grid (only empty cells) - table_data = [ - [ - TableCell( - text="", - start_row_offset_idx=i, - end_row_offset_idx=i + 1, - start_col_offset_idx=j, - end_col_offset_idx=j + 1, - ) - for j in range(self.num_cols) - ] - for i in range(self.num_rows) - ] - - # Overwrite cells in table data for which there is actual cell content. - for cell in self.table_cells: - for i in range( - min(cell.start_row_offset_idx, self.num_rows), - min(cell.end_row_offset_idx, self.num_rows), - ): - for j in range( - min(cell.start_col_offset_idx, self.num_cols), - min(cell.end_col_offset_idx, self.num_cols), - ): - table_data[i][j] = cell - - return table_data - - class PictureTabularChartData(PictureChartData): """Base class for picture chart data. @@ -375,7 +290,7 @@ class PictureTabularChartData(PictureChartData): """ kind: Literal["tabular_chart_data"] = "tabular_chart_data" - chart_data: TableData + chart_data: "TableData" PictureDataType = Annotated[ @@ -468,17 +383,40 @@ def _split_ref_to_path(self): def resolve(self, doc: "DoclingDocument"): """Resolve the path in the document.""" + # Split the path and handle the '#' at the beginning path_components = self.cref.split("/") - if (num_comps := len(path_components)) == 3: - _, path, index_str = path_components - index = int(index_str) - obj = doc.__getattribute__(path)[index] - elif num_comps == 2: - _, path = path_components - obj = doc.__getattribute__(path) - else: - raise RuntimeError(f"Unsupported number of path components: {num_comps}") - return obj + start_idx = 1 if path_components[0] in ["#", ""] else 0 + + # Start with the document as our current object + current = doc + + # Navigate through each path component + for i in range(start_idx, len(path_components)): + component = path_components[i] + + if not component: # Skip empty components + continue + + try: + if component.isdigit(): + # Handle array indices + assert isinstance(current, typing.Sequence) + current = current[int(component)] + else: + # Try attribute access first (Pydantic v2 way) + try: + current = current.__getattribute__(component) + except AttributeError: + # Fall back to item access for dict-like objects + assert isinstance(current, typing.Mapping) + current = current[component] + except (IndexError, KeyError, AttributeError, TypeError) as e: + # Only raise for invalid paths with specific message + raise ValueError( + f"Invalid path component '{component}' in '{self.cref}': {str(e)}" + ) + + return current class ImageRef(BaseModel): @@ -1178,6 +1116,98 @@ def export_to_doctags( return text +class TableCell(NodeItem): + """TableCell.""" + + bbox: Optional[BoundingBox] = None + row_span: int = 1 + col_span: int = 1 + start_row_offset_idx: int + end_row_offset_idx: int + start_col_offset_idx: int + end_col_offset_idx: int + text: str = "" # new default + column_header: bool = False + row_header: bool = False + row_section: bool = False + + self_ref: str = Field(pattern=_JSON_POINTER_REGEX, default="0") + # 0 is a valid relative JSON pointer, pointing simply to itself. + # The absolute JSON pointer must be assigned when TableCell becomes + # part of a DoclingDocument table. + + def has_rich_content(self): + """Checks if the table has child elements in the document hierarchy. + + Returns: + bool: True if the table has child elements, False otherwise. + """ + return len(self.children) > 0 + + @model_validator(mode="before") + @classmethod + def from_dict_format(cls, data: Any) -> Any: + """from_dict_format.""" + if isinstance(data, dict): + # Check if this is a native BoundingBox or a bbox from docling-ibm-models + if "text" in data: + return data + text = data["bbox"].get("token", "") + if not len(text): + text_cells = data.pop("text_cell_bboxes", None) + if text_cells: + for el in text_cells: + text += el["token"] + " " + + text = text.strip() + data["text"] = text + + return data + + +class TableData(BaseModel): # TBD + """BaseTableData.""" + + table_cells: List[TableCell] = [] + num_rows: int = 0 + num_cols: int = 0 + + @computed_field # type: ignore + @property + def grid( + self, + ) -> List[List[TableCell]]: + """grid.""" + # Initialise empty table data grid (only empty cells) + table_data = [ + [ + TableCell( + text="", + start_row_offset_idx=i, + end_row_offset_idx=i + 1, + start_col_offset_idx=j, + end_col_offset_idx=j + 1, + ) + for j in range(self.num_cols) + ] + for i in range(self.num_rows) + ] + + # Overwrite cells in table data for which there is actual cell content. + for cell in self.table_cells: + for i in range( + min(cell.start_row_offset_idx, self.num_rows), + min(cell.end_row_offset_idx, self.num_rows), + ): + for j in range( + min(cell.start_col_offset_idx, self.num_cols), + min(cell.end_col_offset_idx, self.num_cols), + ): + table_data[i][j] = cell + + return table_data + + class TableItem(FloatingItem): """TableItem.""" @@ -1187,6 +1217,193 @@ class TableItem(FloatingItem): DocItemLabel.TABLE, ] = DocItemLabel.TABLE + def update_cell( + self, + start_row_offset_idx: int, + end_row_offset_idx: int, + start_col_offset_idx: int, + end_col_offset_idx: int, + column_header: bool = False, + row_header: bool = False, + row_section: bool = False, + bbox: Optional[BoundingBox] = None, + text: Optional[str] = None, + ) -> TableCell: + """Update a cell in the table with the specified parameters. + + If a cell with the exact same row and column offsets exists, it will be updated. + Otherwise, a new cell will be created. + + Args: + start_row_offset_idx: The starting row index. + end_row_offset_idx: The ending row index (exclusive). + start_col_offset_idx: The starting column index. + end_col_offset_idx: The ending column index (exclusive). + column_header: Whether the cell is a column header. + row_header: Whether the cell is a row header. + row_section: Whether the cell is a row section. + bbox: The bounding box of the cell. + text: The text content of the cell. + + Raises: + ValueError: If the indices exceed the table dimensions or if the cell overlaps + with existing cells. + """ + # Check if indices exceed table dimensions + if start_row_offset_idx < 0 or end_row_offset_idx > self.data.num_rows: + raise ValueError( + f"Row indices out of bounds: {start_row_offset_idx} to {end_row_offset_idx} " + f"(table has {self.data.num_rows} rows)" + ) + + if start_col_offset_idx < 0 or end_col_offset_idx > self.data.num_cols: + raise ValueError( + f"Column indices out of bounds: {start_col_offset_idx} to {end_col_offset_idx} " + f"(table has {self.data.num_cols} columns)" + ) + + # Calculate row_span and col_span + row_span = end_row_offset_idx - start_row_offset_idx + col_span = end_col_offset_idx - start_col_offset_idx + + if row_span <= 0 or col_span <= 0: + raise ValueError(f"Invalid span: row_span={row_span}, col_span={col_span}") + + # Check if a cell with these exact coordinates already exists + existing_cell_index = None + for i, cell in enumerate(self.data.table_cells): + if ( + cell.start_row_offset_idx == start_row_offset_idx + and cell.end_row_offset_idx == end_row_offset_idx + and cell.start_col_offset_idx == start_col_offset_idx + and cell.end_col_offset_idx == end_col_offset_idx + ): + existing_cell_index = i + break + + # Check for overlaps with existing cells (excluding the cell we're updating) + for i, cell in enumerate(self.data.table_cells): + if i == existing_cell_index: + continue + + # Check if the new cell overlaps with any existing cell + # Note: end indices are exclusive, so cells touching at boundaries don't overlap + row_overlap = ( + start_row_offset_idx < cell.end_row_offset_idx + and end_row_offset_idx > cell.start_row_offset_idx + ) + col_overlap = ( + start_col_offset_idx < cell.end_col_offset_idx + and end_col_offset_idx > cell.start_col_offset_idx + ) + + if row_overlap and col_overlap: + raise ValueError(f"New cell overlaps with existing cell: {cell.text}") + + # Create a new cell + cell_text = text if text is not None else "" + new_cell = TableCell( + text=cell_text, + row_span=row_span, + col_span=col_span, + start_row_offset_idx=start_row_offset_idx, + end_row_offset_idx=end_row_offset_idx, + start_col_offset_idx=start_col_offset_idx, + end_col_offset_idx=end_col_offset_idx, + column_header=column_header, + row_header=row_header, + row_section=row_section, + bbox=bbox, + ) + + # Update or add the cell + if existing_cell_index is not None: + self.data.table_cells[existing_cell_index] = new_cell + new_cell.self_ref = ( + f"{self.self_ref}/data/table_cells/{existing_cell_index}" + ) + else: + self.data.table_cells.append(new_cell) + new_cell.self_ref = ( + f"{self.self_ref}/data/table_cells/{len(self.data.table_cells) - 1}" + ) + + return new_cell + + def delete_cells( + self, + start_row_offset_idx: int, + end_row_offset_idx: int, + start_col_offset_idx: int, + end_col_offset_idx: int, + strictly_contained: bool = False, + ) -> int: + """Delete cells that interact with the specified range. + + Args: + start_row_offset_idx: The starting row index. + end_row_offset_idx: The ending row index (exclusive). + start_col_offset_idx: The starting column index. + end_col_offset_idx: The ending column index (exclusive). + strictly_contained: If True, only delete cells that are entirely within the range. + If False, delete any cell that overlaps with the range. + + Returns: + The number of cells deleted. + + Raises: + ValueError: If the indices exceed the table dimensions. + """ + # Check if indices exceed table dimensions + if start_row_offset_idx < 0 or end_row_offset_idx > self.data.num_rows: + raise ValueError( + f"Row indices out of bounds: {start_row_offset_idx} to {end_row_offset_idx} " + f"(table has {self.data.num_rows} rows)" + ) + + if start_col_offset_idx < 0 or end_col_offset_idx > self.data.num_cols: + raise ValueError( + f"Column indices out of bounds: {start_col_offset_idx} to {end_col_offset_idx} " + f"(table has {self.data.num_cols} columns)" + ) + + if ( + start_row_offset_idx >= end_row_offset_idx + or start_col_offset_idx >= end_col_offset_idx + ): + raise ValueError( + f"Invalid range: rows={start_row_offset_idx}:{end_row_offset_idx}, " + f"cols={start_col_offset_idx}:{end_col_offset_idx}" + ) + + # Find cells that interact with the specified range + cells_to_delete = [] + for i, cell in enumerate(self.data.table_cells): + if strictly_contained: + # Check if the cell is entirely within the specified range + if ( + cell.start_row_offset_idx >= start_row_offset_idx + and cell.end_row_offset_idx <= end_row_offset_idx + and cell.start_col_offset_idx >= start_col_offset_idx + and cell.end_col_offset_idx <= end_col_offset_idx + ): + cells_to_delete.append(i) + else: + # Check if the cell overlaps with the specified range + if ( + cell.start_row_offset_idx < end_row_offset_idx + and cell.end_row_offset_idx > start_row_offset_idx + and cell.start_col_offset_idx < end_col_offset_idx + and cell.end_col_offset_idx > start_col_offset_idx + ): + cells_to_delete.append(i) + + # Delete the cells in reverse order to avoid index shifting issues + for index in sorted(cells_to_delete, reverse=True): + del self.data.table_cells[index] + + return len(cells_to_delete) + def export_to_dataframe(self) -> pd.DataFrame: """Export the table as a Pandas DataFrame.""" if self.data.num_rows == 0 or self.data.num_cols == 0: diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index a27365b8..0dfb5d28 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -167,7 +167,7 @@ "description": "CodeItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -420,7 +420,7 @@ "description": "FormItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -539,7 +539,7 @@ "description": "FormulaItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -748,7 +748,7 @@ "description": "GroupItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -852,7 +852,7 @@ "description": "InlineGroup.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -902,7 +902,7 @@ "description": "KeyValueItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -994,7 +994,7 @@ "description": "SectionItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -1096,7 +1096,7 @@ "description": "OrderedList.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -1285,7 +1285,7 @@ "description": "PictureItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -1701,7 +1701,7 @@ "description": "RefItem.", "properties": { "$ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "$Ref", "type": "string" } @@ -1717,7 +1717,7 @@ "description": "SectionItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -1829,8 +1829,38 @@ "type": "object" }, "TableCell": { + "additionalProperties": false, "description": "TableCell.", "properties": { + "self_ref": { + "default": "0", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", + "title": "Self Ref", + "type": "string" + }, + "parent": { + "anyOf": [ + { + "$ref": "#/$defs/RefItem" + }, + { + "type": "null" + } + ], + "default": null + }, + "children": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Children", + "type": "array" + }, + "content_layer": { + "$ref": "#/$defs/ContentLayer", + "default": "body" + }, "bbox": { "anyOf": [ { @@ -1869,6 +1899,7 @@ "type": "integer" }, "text": { + "default": "", "title": "Text", "type": "string" }, @@ -1892,8 +1923,7 @@ "start_row_offset_idx", "end_row_offset_idx", "start_col_offset_idx", - "end_col_offset_idx", - "text" + "end_col_offset_idx" ], "title": "TableCell", "type": "object" @@ -1928,7 +1958,7 @@ "description": "TableItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -2023,7 +2053,7 @@ "description": "TextItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -2125,7 +2155,7 @@ "description": "TitleItem.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -2217,7 +2247,7 @@ "description": "UnorderedList.", "properties": { "self_ref": { - "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "pattern": "^(?:#|\\d+(?:#)?)(?:/(?:[\\w-]+|\\d+))*$", "title": "Self Ref", "type": "string" }, @@ -2272,7 +2302,7 @@ "type": "string" }, "version": { - "default": "1.3.0", + "default": "1.4.0", "pattern": "^(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)(?:-(?P(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$", "title": "Version", "type": "string" diff --git a/test/data/doc/constructed_doc.appended_child.json.gt b/test/data/doc/constructed_doc.appended_child.json.gt index 1fac318f..57a834b4 100644 --- a/test/data/doc/constructed_doc.appended_child.json.gt +++ b/test/data/doc/constructed_doc.appended_child.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -963,6 +963,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -975,6 +978,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -987,6 +993,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -999,6 +1008,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1011,6 +1023,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1023,6 +1038,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1035,6 +1053,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1052,6 +1073,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1064,6 +1088,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1076,6 +1103,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1090,6 +1120,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1102,6 +1135,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1114,6 +1150,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1128,6 +1167,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1140,6 +1182,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1152,6 +1197,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.deleted_group.json.gt b/test/data/doc/constructed_doc.deleted_group.json.gt index 55cb7002..83f55c1d 100644 --- a/test/data/doc/constructed_doc.deleted_group.json.gt +++ b/test/data/doc/constructed_doc.deleted_group.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -961,6 +961,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -973,6 +976,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -985,6 +991,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -997,6 +1006,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1009,6 +1021,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1021,6 +1036,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1033,6 +1051,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1050,6 +1071,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1062,6 +1086,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1074,6 +1101,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1088,6 +1118,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1100,6 +1133,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1112,6 +1148,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1126,6 +1165,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1138,6 +1180,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1150,6 +1195,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.deleted_picture.json.gt b/test/data/doc/constructed_doc.deleted_picture.json.gt index f388c10b..4c9611f1 100644 --- a/test/data/doc/constructed_doc.deleted_picture.json.gt +++ b/test/data/doc/constructed_doc.deleted_picture.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -931,6 +931,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -943,6 +946,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -955,6 +961,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -967,6 +976,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -979,6 +991,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -991,6 +1006,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1003,6 +1021,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1020,6 +1041,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1032,6 +1056,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1044,6 +1071,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1058,6 +1088,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1070,6 +1103,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1082,6 +1118,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1096,6 +1135,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1108,6 +1150,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1120,6 +1165,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.deleted_text.json.gt b/test/data/doc/constructed_doc.deleted_text.json.gt index 1757c26b..b65eb75c 100644 --- a/test/data/doc/constructed_doc.deleted_text.json.gt +++ b/test/data/doc/constructed_doc.deleted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -1203,6 +1203,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1215,6 +1218,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1227,6 +1233,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1239,6 +1248,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1251,6 +1263,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1263,6 +1278,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1275,6 +1293,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1292,6 +1313,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1304,6 +1328,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1316,6 +1343,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1330,6 +1360,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1342,6 +1375,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1354,6 +1390,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1368,6 +1407,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1380,6 +1422,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1392,6 +1437,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.embedded.json.gt b/test/data/doc/constructed_doc.embedded.json.gt index 148619f5..f972628c 100644 --- a/test/data/doc/constructed_doc.embedded.json.gt +++ b/test/data/doc/constructed_doc.embedded.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -1186,6 +1186,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1198,6 +1201,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1210,6 +1216,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1222,6 +1231,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1234,6 +1246,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1246,6 +1261,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1258,6 +1276,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1275,6 +1296,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1287,6 +1311,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1299,6 +1326,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1313,6 +1343,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1325,6 +1358,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1337,6 +1373,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1351,6 +1390,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1363,6 +1405,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1375,6 +1420,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.embedded.yaml.gt b/test/data/doc/constructed_doc.embedded.yaml.gt index 0820452f..aa177a00 100644 --- a/test/data/doc/constructed_doc.embedded.yaml.gt +++ b/test/data/doc/constructed_doc.embedded.yaml.gt @@ -286,166 +286,214 @@ tables: content_layer: body data: grid: - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 1 text: '2016' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 1 text: '2017' - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 2 text: Apple - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 2 text: '49823' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 2 text: '695944' num_cols: 3 num_rows: 3 table_cells: - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 1 text: '2016' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 1 text: '2017' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 2 text: Apple - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 2 text: '49823' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 2 text: '695944' @@ -1015,4 +1063,4 @@ texts: prov: [] self_ref: '#/texts/50' text: The end. -version: 1.3.0 +version: 1.4.0 diff --git a/test/data/doc/constructed_doc.inserted_text.json.gt b/test/data/doc/constructed_doc.inserted_text.json.gt index 016e79e6..b43a98a8 100644 --- a/test/data/doc/constructed_doc.inserted_text.json.gt +++ b/test/data/doc/constructed_doc.inserted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -1220,6 +1220,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1232,6 +1235,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1244,6 +1250,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1256,6 +1265,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1268,6 +1280,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1280,6 +1295,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1292,6 +1310,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1309,6 +1330,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1321,6 +1345,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1333,6 +1360,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1347,6 +1377,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1359,6 +1392,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1371,6 +1407,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1385,6 +1424,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1397,6 +1439,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1409,6 +1454,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.referenced.json.gt b/test/data/doc/constructed_doc.referenced.json.gt index 4a946570..ede16453 100644 --- a/test/data/doc/constructed_doc.referenced.json.gt +++ b/test/data/doc/constructed_doc.referenced.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -1186,6 +1186,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1198,6 +1201,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1210,6 +1216,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1222,6 +1231,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1234,6 +1246,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1246,6 +1261,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1258,6 +1276,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1275,6 +1296,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1287,6 +1311,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1299,6 +1326,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1313,6 +1343,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1325,6 +1358,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1337,6 +1373,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1351,6 +1390,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1363,6 +1405,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1375,6 +1420,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/doc/constructed_doc.referenced.yaml.gt b/test/data/doc/constructed_doc.referenced.yaml.gt index 37815244..c1e664d8 100644 --- a/test/data/doc/constructed_doc.referenced.yaml.gt +++ b/test/data/doc/constructed_doc.referenced.yaml.gt @@ -286,166 +286,214 @@ tables: content_layer: body data: grid: - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 1 text: '2016' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 1 text: '2017' - - - col_span: 1 + - - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 2 text: Apple - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 2 text: '49823' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 2 text: '695944' num_cols: 3 num_rows: 3 table_cells: - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 row_header: false row_section: false row_span: 2 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: Product - - col_span: 2 + - children: [] + col_span: 2 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: Years - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 1 text: '2016' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 2 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 1 text: '2017' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 2 text: Apple - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 2 text: '49823' - - col_span: 1 + - children: [] + col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 3 row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 2 text: '695944' @@ -1015,4 +1063,4 @@ texts: prov: [] self_ref: '#/texts/50' text: The end. -version: 1.3.0 +version: 1.4.0 diff --git a/test/data/doc/constructed_doc.replaced_item.json.gt b/test/data/doc/constructed_doc.replaced_item.json.gt index 02736e83..84ff433e 100644 --- a/test/data/doc/constructed_doc.replaced_item.json.gt +++ b/test/data/doc/constructed_doc.replaced_item.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", @@ -963,6 +963,9 @@ "data": { "table_cells": [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -975,6 +978,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -987,6 +993,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -999,6 +1008,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1011,6 +1023,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1023,6 +1038,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1035,6 +1053,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1052,6 +1073,9 @@ "grid": [ [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1064,6 +1088,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1076,6 +1103,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 2, "start_row_offset_idx": 0, @@ -1090,6 +1120,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 2, "col_span": 1, "start_row_offset_idx": 0, @@ -1102,6 +1135,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1114,6 +1150,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 1, @@ -1128,6 +1167,9 @@ ], [ { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1140,6 +1182,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, @@ -1152,6 +1197,9 @@ "row_section": false }, { + "self_ref": "0", + "children": [], + "content_layer": "body", "row_span": 1, "col_span": 1, "start_row_offset_idx": 2, diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml index 15d9a07f..09d1e505 100644 --- a/test/data/docling_document/unit/TableItem.yaml +++ b/test/data/docling_document/unit/TableItem.yaml @@ -1,169 +1,230 @@ captions: [] children: [] +content_layer: body data: grid: - - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 1 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 0 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 1 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 0 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 1 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 0 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 4 end_row_offset_idx: 1 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 3 start_row_offset_idx: 0 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 5 end_row_offset_idx: 1 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 4 start_row_offset_idx: 0 text: '' - - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 2 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 1 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 2 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 1 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 2 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 1 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 4 end_row_offset_idx: 2 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 3 start_row_offset_idx: 1 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 5 end_row_offset_idx: 2 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 4 start_row_offset_idx: 1 text: '' - - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 1 end_row_offset_idx: 3 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 0 start_row_offset_idx: 2 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 2 end_row_offset_idx: 3 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 1 start_row_offset_idx: 2 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 3 end_row_offset_idx: 3 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 2 start_row_offset_idx: 2 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 4 end_row_offset_idx: 3 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 3 start_row_offset_idx: 2 text: '' - bbox: null + children: [] col_span: 1 column_header: false + content_layer: body end_col_offset_idx: 5 end_row_offset_idx: 3 + parent: null row_header: false row_section: false row_span: 1 + self_ref: '0' start_col_offset_idx: 4 start_row_offset_idx: 2 text: '' @@ -176,5 +237,4 @@ label: table parent: null prov: [] references: [] -self_ref: '#' -content_layer: body \ No newline at end of file +self_ref: '#' \ No newline at end of file diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index 4b3534f3..3358565b 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -15,8 +15,8 @@ def test_table_export_to_otsl(): start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, - end_col_offset_idx=3, - col_header=False, + end_col_offset_idx=2, + column_header=False, row_header=True, ) ) @@ -30,7 +30,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=1, start_col_offset_idx=2, end_col_offset_idx=3, - col_header=False, + column_header=False, row_header=True, ) ) @@ -45,7 +45,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, - col_header=False, + column_header=False, row_header=True, ) ) @@ -59,7 +59,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=2, start_col_offset_idx=1, end_col_offset_idx=2, - col_header=False, + column_header=False, row_header=False, ) ) @@ -73,7 +73,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=2, start_col_offset_idx=2, end_col_offset_idx=3, - col_header=False, + column_header=False, row_header=False, ) ) @@ -88,7 +88,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=2, start_col_offset_idx=3, end_col_offset_idx=6, - col_header=True, + column_header=True, row_header=False, ) ) @@ -103,7 +103,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=0, end_col_offset_idx=1, - col_header=False, + column_header=False, row_header=True, ) ) @@ -117,7 +117,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=3, start_col_offset_idx=1, end_col_offset_idx=2, - col_header=False, + column_header=False, row_header=False, ) ) @@ -131,7 +131,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=3, start_col_offset_idx=2, end_col_offset_idx=3, - col_header=False, + column_header=False, row_header=False, ) ) @@ -145,7 +145,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=3, start_col_offset_idx=3, end_col_offset_idx=4, - col_header=False, + column_header=False, row_header=False, ) ) @@ -159,7 +159,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=3, start_col_offset_idx=4, end_col_offset_idx=5, - col_header=False, + column_header=False, row_header=False, ) ) @@ -173,7 +173,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=3, start_col_offset_idx=5, end_col_offset_idx=6, - col_header=False, + column_header=False, row_header=False, ) ) @@ -189,7 +189,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=1, end_col_offset_idx=2, - col_header=False, + column_header=False, row_header=False, ) ) @@ -203,7 +203,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=2, end_col_offset_idx=3, - col_header=False, + column_header=False, row_header=False, ) ) @@ -217,7 +217,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=3, end_col_offset_idx=4, - col_header=False, + column_header=False, row_header=False, ) ) @@ -231,7 +231,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=4, end_col_offset_idx=5, - col_header=False, + column_header=False, row_header=False, ) ) @@ -245,7 +245,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=4, start_col_offset_idx=5, end_col_offset_idx=6, - col_header=False, + column_header=False, row_header=False, ) ) @@ -260,7 +260,7 @@ def test_table_export_to_otsl(): end_row_offset_idx=5, start_col_offset_idx=0, end_col_offset_idx=6, - col_header=False, + column_header=False, row_header=False, row_section=True, ) @@ -269,7 +269,7 @@ def test_table_export_to_otsl(): # ====================================== doc = DoclingDocument(name="test_otsl") data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=data_table_cells) - doc.add_table(data=data) + tbl = doc.add_table(data=data) otsl_string = doc.tables[0].export_to_otsl( add_cell_location=False, add_cell_text=False, doc=doc @@ -280,5 +280,5 @@ def test_table_export_to_otsl(): print(s) assert ( otsl_string - == "" + == "" ) diff --git a/test/test_rich_tables.py b/test/test_rich_tables.py new file mode 100644 index 00000000..9f14bac8 --- /dev/null +++ b/test/test_rich_tables.py @@ -0,0 +1,84 @@ +import json + +from docling_core.types.doc import DocItemLabel +from docling_core.types.doc.document import DoclingDocument, RefItem, TableData + + +def test_construct_rich_table(): + num_cols = 3 + num_rows = 3 + + doc = DoclingDocument(name="test_rich_table") + data = TableData(num_rows=num_rows, num_cols=num_cols) + tbl = doc.add_table(data=data) + + first_cell = tbl.update_cell( + text="AB", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=2, + column_header=False, + row_header=True, + ) + + print(first_cell) + + second_cell = tbl.update_cell( + text="C", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=2, + end_col_offset_idx=3, + column_header=False, + row_header=True, + ) + + print(second_cell) + + third_cell = tbl.update_cell( + text="1", + start_row_offset_idx=1, + end_row_offset_idx=3, + start_col_offset_idx=0, + end_col_offset_idx=1, + column_header=False, + row_header=True, + ) + + print(third_cell) + + fourth_cell = tbl.update_cell( + text="2", + start_row_offset_idx=1, + end_row_offset_idx=2, + start_col_offset_idx=1, + end_col_offset_idx=2, + column_header=False, + row_header=False, + ) + + print(fourth_cell) + + fifth_cell = tbl.update_cell( + text="3", + start_row_offset_idx=1, + end_row_offset_idx=2, + start_col_offset_idx=2, + end_col_offset_idx=3, + column_header=False, + row_header=False, + ) + + print(fifth_cell) + + doc.add_text(parent=fifth_cell, text="Foo", label=DocItemLabel.TEXT) + + # Test serialization + print(tbl.export_to_markdown(doc)) + print(tbl.export_to_html(doc)) + print(json.dumps(doc.export_to_dict(), indent=2)) + + # Test resolution from table cell ref: + resolved = RefItem(cref="#/tables/0/data/table_cells/2").resolve(doc) + print(resolved) From e474c238d8ed5714fa6458d2272ee940a4d0ea76 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 6 May 2025 13:09:26 +0200 Subject: [PATCH 2/4] Improve comments on test case Signed-off-by: Christoph Auer --- test/test_rich_tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_rich_tables.py b/test/test_rich_tables.py index 9f14bac8..ca55dca0 100644 --- a/test/test_rich_tables.py +++ b/test/test_rich_tables.py @@ -8,10 +8,12 @@ def test_construct_rich_table(): num_cols = 3 num_rows = 3 + # Construct a table by building data without cells first... doc = DoclingDocument(name="test_rich_table") data = TableData(num_rows=num_rows, num_cols=num_cols) tbl = doc.add_table(data=data) + # Then add cells with the update_cell method. first_cell = tbl.update_cell( text="AB", start_row_offset_idx=0, @@ -61,7 +63,6 @@ def test_construct_rich_table(): print(fourth_cell) fifth_cell = tbl.update_cell( - text="3", start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=2, @@ -72,6 +73,7 @@ def test_construct_rich_table(): print(fifth_cell) + # Add a child item to the fifth cell to add rich content. doc.add_text(parent=fifth_cell, text="Foo", label=DocItemLabel.TEXT) # Test serialization From 46d51283ee2e23f88952fde531c2f70df0d856d0 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 6 May 2025 14:41:49 +0200 Subject: [PATCH 3/4] add HTML serialization for rich tables Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/html.py | 7 +- test/data/doc/rich_table_doc.html | 134 +++++++++++++++++++++ test/test_rich_tables.py | 56 +++++---- 3 files changed, 167 insertions(+), 30 deletions(-) create mode 100644 test/data/doc/rich_table_doc.html diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 8ad8cbbd..9d8de3ea 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -329,11 +329,10 @@ def serialize( if colstart != j: continue + content = html.escape(cell.text.strip()) if cell.has_rich_content(): - # TODO: Do something that serializes the cell and its children - content = html.escape(cell.text.strip()) - else: - content = html.escape(cell.text.strip()) + cell_parts = doc_serializer.get_parts(item=cell) + content += "\n" + "\n".join([part.text for part in cell_parts]) celltag = "td" if cell.column_header: celltag = "th" diff --git a/test/data/doc/rich_table_doc.html b/test/data/doc/rich_table_doc.html new file mode 100644 index 00000000..d4088add --- /dev/null +++ b/test/data/doc/rich_table_doc.html @@ -0,0 +1,134 @@ + + + + +test_rich_table + + + + +
+
ABC
123 +
    +
  • foo
  • +
  • bar
  • +
+
+ + diff --git a/test/test_rich_tables.py b/test/test_rich_tables.py index 9f14bac8..aa909c47 100644 --- a/test/test_rich_tables.py +++ b/test/test_rich_tables.py @@ -1,10 +1,19 @@ -import json +from docling_core.types.doc.document import DoclingDocument, TableData -from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.document import DoclingDocument, RefItem, TableData +from .test_data_gen_flag import GEN_TEST_DATA -def test_construct_rich_table(): +def _verify(act_data: str, filename: str, extension: str): + if GEN_TEST_DATA: + with open(filename + f".{extension}", "w", encoding="utf-8") as f: + f.write(f"{act_data}\n") + else: + with open(filename + f".{extension}", "r", encoding="utf-8") as f: + exp_data = f.read().rstrip() + assert exp_data == act_data + + +def _construct_doc(): num_cols = 3 num_rows = 3 @@ -12,7 +21,7 @@ def test_construct_rich_table(): data = TableData(num_rows=num_rows, num_cols=num_cols) tbl = doc.add_table(data=data) - first_cell = tbl.update_cell( + tbl.update_cell( text="AB", start_row_offset_idx=0, end_row_offset_idx=1, @@ -22,9 +31,7 @@ def test_construct_rich_table(): row_header=True, ) - print(first_cell) - - second_cell = tbl.update_cell( + tbl.update_cell( text="C", start_row_offset_idx=0, end_row_offset_idx=1, @@ -34,9 +41,7 @@ def test_construct_rich_table(): row_header=True, ) - print(second_cell) - - third_cell = tbl.update_cell( + tbl.update_cell( text="1", start_row_offset_idx=1, end_row_offset_idx=3, @@ -46,9 +51,7 @@ def test_construct_rich_table(): row_header=True, ) - print(third_cell) - - fourth_cell = tbl.update_cell( + tbl.update_cell( text="2", start_row_offset_idx=1, end_row_offset_idx=2, @@ -58,9 +61,7 @@ def test_construct_rich_table(): row_header=False, ) - print(fourth_cell) - - fifth_cell = tbl.update_cell( + rich_cell = tbl.update_cell( text="3", start_row_offset_idx=1, end_row_offset_idx=2, @@ -70,15 +71,18 @@ def test_construct_rich_table(): row_header=False, ) - print(fifth_cell) + list_node = doc.add_unordered_list(name="inner list", parent=rich_cell) + doc.add_list_item(text="foo", parent=list_node) + doc.add_list_item(text="bar", parent=list_node) + + return doc + + +def test_rich_table(): - doc.add_text(parent=fifth_cell, text="Foo", label=DocItemLabel.TEXT) + filename = "test/data/doc/rich_table_doc" - # Test serialization - print(tbl.export_to_markdown(doc)) - print(tbl.export_to_html(doc)) - print(json.dumps(doc.export_to_dict(), indent=2)) + doc = _construct_doc() - # Test resolution from table cell ref: - resolved = RefItem(cref="#/tables/0/data/table_cells/2").resolve(doc) - print(resolved) + html_pred = doc.export_to_html() + _verify(act_data=html_pred, filename=filename, extension="html") From 3d3ed5aeca7d25f6677f709b12abfc69c8def22c Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 6 May 2025 14:49:52 +0200 Subject: [PATCH 4/4] minor test refactor Signed-off-by: Panos Vagenas --- test/test_rich_tables.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/test_rich_tables.py b/test/test_rich_tables.py index aa909c47..e0893350 100644 --- a/test/test_rich_tables.py +++ b/test/test_rich_tables.py @@ -3,12 +3,12 @@ from .test_data_gen_flag import GEN_TEST_DATA -def _verify(act_data: str, filename: str, extension: str): +def _verify(act_data: str, exp_file: str): if GEN_TEST_DATA: - with open(filename + f".{extension}", "w", encoding="utf-8") as f: + with open(exp_file, "w", encoding="utf-8") as f: f.write(f"{act_data}\n") else: - with open(filename + f".{extension}", "r", encoding="utf-8") as f: + with open(exp_file, "r", encoding="utf-8") as f: exp_data = f.read().rstrip() assert exp_data == act_data @@ -80,9 +80,7 @@ def _construct_doc(): def test_rich_table(): - filename = "test/data/doc/rich_table_doc" - doc = _construct_doc() html_pred = doc.export_to_html() - _verify(act_data=html_pred, filename=filename, extension="html") + _verify(act_data=html_pred, exp_file="test/data/doc/rich_table_doc.html")