From b20b595118ae5b90a2e50855f1fd27246d76eb7f Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Wed, 12 Nov 2025 18:04:33 +0100 Subject: [PATCH 01/15] chore: Code comments Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 2 ++ docling_eval/evaluators/layout_evaluator.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 0b635cd0..388bfa6b 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -668,6 +668,8 @@ def evaluate( with open(save_fn, "w") as fd: json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + # TODO: Add also the pixel-wise layout evaluation + elif modality == EvaluationModality.TABLE_STRUCTURE: table_evaluator = TableEvaluator() evaluation = table_evaluator( # type: ignore diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 1fb520e2..37b60af6 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -939,6 +939,11 @@ def _extract_layout_data( _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}") # Process pages in sorted order to ensure consistent alignment + # List[Tuple[page_no, Dict[str, torch.Tensor]]]. The dict has tensors with bboxes, labels: + # "boxes": torch.tensor(bboxes, dtype=torch.float32), + # "labels": torch.tensor(labels, dtype=torch.long), + # "scores": torch.tensor(scores, dtype=torch.float32) # Only for the predictions + # The bboxes are in top-left origin, in x1y1x2y2 format, normalized and scaled to 100 ground_truths: List[Tuple[int, Dict[str, torch.Tensor]]] = [] predictions: List[Tuple[int, Dict[str, torch.Tensor]]] = [] From a379caef097867205d5fb4adb999dc0aba892c2c Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Wed, 12 Nov 2025 18:05:37 +0100 Subject: [PATCH 02/15] feat: WIP: Migrating the PixelLayoutEvaluator Signed-off-by: Nikos Livathinos --- .../pixel/confusion_matrix_exporter.py | 547 ++++++++++++++++++ .../pixel/multi_label_confusion_matrix.py | 465 +++++++++++++++ .../evaluators/pixel_layout_evaluator.py | 332 +++++++++++ 3 files changed, 1344 insertions(+) create mode 100644 docling_eval/evaluators/pixel/confusion_matrix_exporter.py create mode 100644 docling_eval/evaluators/pixel/multi_label_confusion_matrix.py create mode 100644 docling_eval/evaluators/pixel_layout_evaluator.py diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py new file mode 100644 index 00000000..718f0537 --- /dev/null +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -0,0 +1,547 @@ +import argparse +import colorsys +import json +import logging +from pathlib import Path +from typing import Optional + +import numpy as np +import pandas as pd +from openpyxl import Workbook, load_workbook +from openpyxl.styles import Border, Font, PatternFill, Side +from openpyxl.utils import get_column_letter +from openpyxl.worksheet.worksheet import Worksheet +from pandas import ExcelWriter + +# from src.utils.utils import discover_filename_prefix +from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( + MultiLabelConfusionMatrix, +) + +_log = logging.getLogger(__name__) + + +def linear_norm(x, x_min, x_max, k=5.0): + d = x_max - x_min + x = (x - x_min) / (x_max - x_min) if d != 0 else 0 + return x + + +def exp_norm(x, x_min, x_max, k=5.0): + d = x_max - x_min + x = (x - x_min) / (x_max - x_min) if d != 0 else 0 + return 1 - 
np.exp(-k * x) + + +def power_norm(x, x_min, x_max, p=0.3): + d = x_max - x_min + x = (x - x_min) / (x_max - x_min) if d != 0 else 0 + return x**p + + +class ConfusionMatrixExporter: + r""" """ + + TITLE_FONT_SIZE = 16 + SUBTITLE_FONT_SIZE = 14 + DATASET_WORKSHEET_NAME = "Dataset" + IMAGES_WORKSHEET_NAME = "Images" + + def __init__( + self, + ): + self._background_color = "bbbbbb" + self._black_color = "444444" + self._power_normalization_exp = 0.2 + + border_style = "thick" + border_color = "fc5a8d" + self._highlighted_border = Border( + left=Side(border_style=border_style, color=border_color), + right=Side(border_style=border_style, color=border_color), + top=Side(border_style=border_style, color=border_color), + bottom=Side(border_style=border_style, color=border_color), + ) + + def export_excel_from_json( + self, + save_path: Path, + pixel_evaluations_fn: Path, + ): + r""" """ + with open(pixel_evaluations_fn, "r") as fd: + pixel_evaluations = json.load(fd) + + # Reconsturct the confusion matrix + confusion_matrix_as_list = pixel_evaluations["confusion_matrix"] + confusion_matrix = np.asarray(confusion_matrix_as_list, dtype=np.float32) + + # Reconstruct the headers + class_names: dict[int, str] = pixel_evaluations["classes_names"] + headers = list(class_names.values()) + save_path.mkdir(parents=True, exist_ok=True) + excel_fn = save_path / f"{pixel_evaluations_fn.stem}.xlsx" + + # TODO: + # self._export_matrix_to_excel(confusion_matrix, headers, excel_fn) + + def build_ds_report( + self, + model_name: str, + num_images: int, + num_pixels: int, + headers: list[str], + confusion_matrix: np.ndarray, + ds_metrics: dict, + colapsed_headers: list[str], + image_colaped_aggs: dict[str, np.ndarray], + excel_fn: Path, + visualisations_root: Optional[Path], + ): + r""" + Generate excel report for the full dataset + """ + with pd.ExcelWriter(excel_fn, engine="openpyxl") as writer: + # Add the dataset header + wb: Workbook = writer.book + if not wb.worksheets: + wb.create_sheet(ConfusionMatrixExporter.DATASET_WORKSHEET_NAME) + wb.active = 0 + ds_ws: Worksheet = wb.active # type: ignore + ds_ws.cell(row=1, column=1).value = model_name + ds_ws.cell(row=1, column=1).font = Font( + bold=True, size=ConfusionMatrixExporter.TITLE_FONT_SIZE + ) + ds_ws.cell(row=2, column=1).value = "#images" + ds_ws.cell(row=2, column=2).value = num_images + ds_ws.cell(row=3, column=1).value = "#pixels" + ds_ws.cell(row=3, column=2).value = num_pixels + ds_ws.cell(row=3, column=2).number_format = f"#,##0" + + # Build the basic report + self._build_base_report( + writer, + ConfusionMatrixExporter.DATASET_WORKSHEET_NAME, + headers, + confusion_matrix, + ds_metrics, + 4, + ) + + # Add the colapsed image metrics in a separate worksheet + self._aggregate_colapsed_image_metrics( + writer, + ConfusionMatrixExporter.IMAGES_WORKSHEET_NAME, + colapsed_headers, + image_colaped_aggs, + visualisations_root=visualisations_root, + ) + + # Adjust column widths + images_ws: Worksheet = wb[ConfusionMatrixExporter.IMAGES_WORKSHEET_NAME] + self._adjust_column_widths(ds_ws) + self._adjust_column_widths(images_ws) + + _log.info("Dataset report: %s", str(excel_fn)) + + def build_image_report( + self, + headers: list[str], + confusion_matrix: np.ndarray, + metrics: dict, + excel_fn: Path, + ): + with pd.ExcelWriter(excel_fn, engine="openpyxl") as writer: + self._build_base_report( + writer, + ConfusionMatrixExporter.DATASET_WORKSHEET_NAME, + headers, + confusion_matrix, + metrics, + ) + + # Adjust column widths + ws: Worksheet = 
writer.book[ConfusionMatrixExporter.DATASET_WORKSHEET_NAME] + self._adjust_column_widths(ws) + + _log.info("Image report: %s", str(excel_fn)) + + def _aggregate_colapsed_image_metrics( + self, + writer: ExcelWriter, + worksheet_name: str, + headers: list[str], + image_colapsed_aggs: dict[str, np.ndarray], + origin_cell: tuple[int, int] = (0, 0), + decimal_digits: int = 3, + visualisations_root: Optional[Path] = None, + ): + r""" + Aggregate all colapsed image metrics + """ + startrow = origin_cell[0] + 1 + startcol = origin_cell[1] + + # Build the dataframe + index = list(image_colapsed_aggs.keys()) + data = np.stack(list(image_colapsed_aggs.values()), axis=0) # [num_images, 12] + data = np.round(data, decimals=3) + df = pd.DataFrame(data, index=index, columns=headers) + + df.to_excel( + writer, + sheet_name=worksheet_name, + index=True, + startrow=startrow, + startcol=startcol, + ) # row/col index starts from 0 + + # Load workbook + wb: Workbook = writer.book + ws: Worksheet = wb[worksheet_name] + + # Set the prediction visualisations as hyperlinks in the image filenames + # if visualisations_root: + # viz_prefix = discover_filename_prefix(visualisations_root, "png") + # if viz_prefix: + # col = startcol + 1 + # for i, image_filename in enumerate(image_colapsed_aggs.keys()): + # row = i + startrow + 2 + # cell = ws.cell(row=row, column=col) + # viz_fn = visualisations_root / f"{viz_prefix}{image_filename}" + # if not viz_fn.is_file(): + # continue + # cell.hyperlink = str(viz_fn) + # cell.style = "Hyperlink" + # else: + # _log.error( + # "Cannot the visualisation prefix in: %s", str(visualisations_root) + # ) + + # Set the subtitle + subtitle_cell = ws.cell( + row=origin_cell[0] + 1, column=origin_cell[1] + 1 + ) # start from 1 + subtitle_cell.value = "Image colapsed classes metrics" + subtitle_cell.font = Font( + bold=True, size=ConfusionMatrixExporter.SUBTITLE_FONT_SIZE + ) + + # Get data min/max values + vmin = np.min(data) + vmax = np.max(data) + + # Apply background colors to data cells + row_start = 2 + startrow # start from 1 + col_start = 2 + startcol + for i in range(data.shape[0]): + row = i + row_start + for j in range(data.shape[1]): + value = data[i, j] + col = j + col_start + if value == 0: + # Treat zero values specially + color = self._black_color + else: + color = self._value_to_color(vmin, vmax, value, "linear") + ws.cell(row=row, column=col).fill = PatternFill( + start_color=color, end_color=color, fill_type="solid" + ) + + # Format the numbers + decimals_format = "." 
+ "0" * decimal_digits + ws.cell(row=row, column=col).number_format = f"#,##0{decimals_format}" + return ws.max_row, ws.max_column + + def _build_base_report( + self, + writer: ExcelWriter, + worksheet_name: str, + headers: list[str], + confusion_matrix: np.ndarray, + metrics: dict, + startrow: int = 0, + hide_zero_rows: bool = True, + hide_zero_cols: bool = True, + ): + r""" + Generate excel report for a single image + """ + detailed_spacing = 4 # spacing between the detailed matrices + colapsed_spacing = 2 # spacing between a detailed and the next colapsed matrix + + colapsed_headers = [ + headers[0], + MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, + ] + + # Add the confusion matrix + max_row, max_col = self._export_matrix_to_excel( + writer, + worksheet_name, + "Confusion Matrix", + confusion_matrix, + headers, + decimal_digits=3, + origin_cell=(startrow, 0), + normalization_func="linear", + hide_zero_rows=hide_zero_rows, + hide_zero_cols=hide_zero_cols, + ) + + # Add the precision matrix with detailed classes + detailed_precision_row = max_row + detailed_spacing + colapsed_precision_row = max_row + colapsed_spacing + detailed_precision_matrix: np.ndarray = metrics[ + MultiLabelConfusionMatrix.DETAILED_METRICS_KEY + ]["precision_matrix"] + max_row, max_col = self._export_matrix_to_excel( + writer, + worksheet_name, + "Precision Matrix", + detailed_precision_matrix, + headers, + decimal_digits=3, + origin_cell=(detailed_precision_row, 0), + normalization_func="linear", + hide_zero_rows=hide_zero_rows, + hide_zero_cols=hide_zero_cols, + ) + detailed_recall_row = max_row + detailed_spacing + colapsed_recall_row = max_row + colapsed_spacing + colapsed_col = max_col + 1 + + # Add the precision matrix with colapsed classes + colapsed_precision_matrix: np.ndarray = metrics[ + MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY + ]["precision_matrix"] + self._export_matrix_to_excel( + writer, + worksheet_name, + "Colapsed Precision Matrix", + colapsed_precision_matrix, + colapsed_headers, + decimal_digits=3, + origin_cell=(colapsed_precision_row, colapsed_col), + normalization_func="linear", + hide_zero_rows=hide_zero_rows, + hide_zero_cols=hide_zero_cols, + ) + + # Add the recall matrix with detailed classes + detailed_recall_matrix: np.ndarray = metrics[ + MultiLabelConfusionMatrix.DETAILED_METRICS_KEY + ]["recall_matrix"] + max_row, max_col = self._export_matrix_to_excel( + writer, + worksheet_name, + "Recall matrix", + detailed_recall_matrix, + headers, + decimal_digits=3, + origin_cell=(detailed_recall_row, 0), + normalization_func="linear", + hide_zero_rows=hide_zero_rows, + hide_zero_cols=hide_zero_cols, + ) + + # Add the recall matrix with colapsed classes + colapsed_recall_matrix: np.ndarray = metrics[ + MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY + ]["recall_matrix"] + self._export_matrix_to_excel( + writer, + worksheet_name, + "Colapsed Recall Matrix", + colapsed_recall_matrix, + colapsed_headers, + decimal_digits=3, + origin_cell=(colapsed_recall_row, colapsed_col), + normalization_func="linear", + hide_zero_rows=hide_zero_rows, + hide_zero_cols=hide_zero_cols, + ) + + def _export_matrix_to_excel( + self, + writer: ExcelWriter, + worksheet_name: str, + title: str, + data: np.ndarray, + headers: list[str], + origin_cell: tuple[int, int] = (0, 0), + special_first_cell: bool = False, + decimal_digits: int = 0, + normalization_func: str = "linear", # One of 'linear', 'power', 'exp' + hide_zero_rows: bool = False, + hide_zero_cols: bool = False, + ) -> tuple[int, int]: + r""" + 
Export the given data in excel and place it in the origin_cell + + Returns: + -------- + max_row + max_col + """ + startrow = origin_cell[0] + startcol = origin_cell[1] + + # Round values + data = np.round(data, decimals=3) + + # Create DataFrame and write to Excel + df = pd.DataFrame(data, index=headers, columns=headers) + df.to_excel( + writer, + sheet_name=worksheet_name, + index=True, + startrow=startrow, + startcol=startcol, + ) # row/col index starts from 0 + + # Load workbook + wb: Workbook = writer.book + ws: Worksheet = wb[worksheet_name] + + # Set the subtitle in the corner of the data + subtitle_cell = ws.cell( + row=origin_cell[0] + 1, column=origin_cell[1] + 1 + ) # start from 1 + subtitle_cell.value = title + subtitle_cell.font = Font( + bold=True, size=ConfusionMatrixExporter.SUBTITLE_FONT_SIZE + ) + + # Get the min/max values + if special_first_cell: + # Don't account for the first value + confusion_mask = np.ones(data.shape, dtype=np.uint8) + confusion_mask[0, 0] = 0 + vmin = np.min(data, initial=0, where=confusion_mask != 0) + vmax = np.max(data, initial=0, where=confusion_mask != 0) + else: + vmin = np.min(data) + vmax = np.max(data) + + # Apply background colors to data cells + style_startrow = 2 + startrow # start from 1 + style_startcol = 2 + startcol + for i in range(len(headers)): + row = i + style_startrow + for j in range(len(headers)): + col = j + style_startcol + # Treat the background specially + if i == 0 and j == 0 and special_first_cell: + ws.cell(row=row, column=col).fill = PatternFill( + start_color=self._background_color, + end_color=self._background_color, + fill_type="solid", + ) + else: + value = data[i, j] + if value == 0: + # Treat zero values specially + color = self._black_color + else: + color = self._value_to_color( + vmin, vmax, value, normalization_func + ) + ws.cell(row=row, column=col).fill = PatternFill( + start_color=color, end_color=color, fill_type="solid" + ) + + # Highlight the diagonal + if i == j: + ws.cell(row=row, column=col).border = self._highlighted_border + + # Format the numbers + decimals_format = "" + if decimal_digits > 0: + decimals_format = "." 
+ "0" * decimal_digits + ws.cell(row=row, column=col).number_format = f"#,##0{decimals_format}" + + # Hide rows/cols with all zeros + if hide_zero_cols: + colsums = np.sum(data, axis=0) + zero_col_indices = np.nonzero(colsums == 0)[0] # Zero column indices + for zero_col_idx in zero_col_indices: + col_idx = zero_col_idx + startcol + 2 + col_letter = get_column_letter(col_idx) + ws.column_dimensions[col_letter].hidden = True + + if hide_zero_rows: + rowsums = np.sum(data, axis=1) + zero_row_indices = np.nonzero(rowsums == 0)[0] # Zero row indices + for zero_row_idx in zero_row_indices: + row_idx = zero_row_idx + startrow + 2 + ws.row_dimensions[row_idx].hidden = True + + return ws.max_row, ws.max_column + + def _value_to_color(self, vmin, vmax, v, normalization_func: str): + """Map value to RGB color from blue→red using rainbow spectrum.""" + # Normalize to [0,1] + if normalization_func == "power": + normalized_value = power_norm( + v, vmin, vmax, p=self._power_normalization_exp + ) + elif normalization_func == "exp": + normalized_value = exp_norm(v, vmin, vmax) + else: + normalized_value = linear_norm(v, vmin, vmax) + + # Use HSV rainbow mapping: hue 240° (blue) -> 0° (red) + hue = (1 - normalized_value) * 240 / 360 # convert degrees to [0,1] + r, g, b = colorsys.hsv_to_rgb(hue, 1, 1) + + # Convert to hex color for Excel + hex_color = f"{int(r * 255):02X}{int(g * 255):02X}{int(b * 255):02X}" + return hex_color + + def _adjust_column_widths(self, ws: Worksheet): + r"""Adjust column widths for the final excel""" + for col in ws.columns: + max_length = 0 + col_letter = col[0].column_letter # type: ignore + for cell in col: + val = str(cell.value) + max_length = max(max_length, len(val)) + ws.column_dimensions[col_letter].width = max_length + 2 + + +def main(): + r""" """ + # Parse CLI arguments + parser = argparse.ArgumentParser( + description="Run the PixelLayoutEvaluator with the GT in COCO-format and the predictions in COCO-tools format" + ) + parser.add_argument( + "-s", + "--save_dir", + type=Path, + required=True, + help="Root save directory to save the exported files", + ) + parser.add_argument( + "-p", + "--pixel_evaluations", + type=Path, + required=True, + help="Json with the pixel evaluations", + ) + args = parser.parse_args() + save_dir = args.save_dir + pixel_eval_fn = args.pixel_evaluations + + # Configure logger + log_format = "%(asctime)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + # Initialize the exporter + exporter = ConfusionMatrixExporter() + exporter.export_excel_from_json(save_dir, pixel_eval_fn) + + +if __name__ == "__main__": + main() diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py new file mode 100644 index 00000000..658fa4dc --- /dev/null +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -0,0 +1,465 @@ +import logging +import math +from typing import Optional + +import numpy as np +from pydantic import BaseModel + +_log = logging.getLogger(__name__) + + +class LayoutResolution(BaseModel): + r"""Single bbox resolution""" + + category_id: int + + # bbox coords: (x1, y1, x2, y2) with the origin(0, 0) at the top, left corner, no normalization + bbox: list[float] + + +def unpackbits(x: np.ndarray, num_bits: int): + r""" + Unpack num_bits bits of each element of the numpy array x + The number of bits defines how many bits we will take from x to unpack. 
+ """ + xshape = list(x.shape) + x = x.reshape([-1, 1]) + mask = 2 ** np.arange(num_bits, dtype=x.dtype).reshape([1, num_bits]) + return (x & mask).astype(bool).astype(int).reshape(xshape + [num_bits]) + + +class MultiLabelConfusionMatrix: + r""" """ + + DETAILED_METRICS_KEY = "detailed_classes" + COLAPSED_METRICS_KEY = "colapsed_classes" + ALL_COLAPSED_CLASSES_NAME = "all_classes" + + def __init__( + self, + validation_mode: str = "disabled", + ): + r""" + The validation mode can be one of: ["disabled", "log", "raise"] + """ + self._validation_mode = validation_mode + + def make_binary_representation( + self, + image_width: int, + image_height: int, + resolutions: list[LayoutResolution], + set_background: bool = True, + ) -> np.ndarray: + r""" + Create a numpy matrix with the binary representation of the layout resolutions + Each pixel is represented as one uint64 integer, where the 1-bit flags presence of a class. + + Parameters + ---------- + set_background: Assign the value 1 to all pixels that still have a zero value in the end + + Returns + ------- + np.ndarray with the binary representation of the resolutions. Dims are equal to the image size + """ + # Initialize the representation matrix with 0 + matrix = np.zeros( + ( + image_height, + image_width, + ), + dtype=np.uint64, + ) + + for res in resolutions: + x1 = res.bbox[0] + y1 = res.bbox[1] + x2 = res.bbox[2] + y2 = res.bbox[3] + x_begin = math.floor(x1) + x_end = math.ceil(x2) + y_begin = math.floor(y1) + y_end = math.ceil(y2) + + cat_id = res.category_id + bit_index = np.uint64(1 << cat_id) + matrix[y_begin:y_end, x_begin:x_end] |= bit_index + + # Set the background class (binary 1) if there is no other class set + if set_background: + matrix[matrix == 0] = 1 + + return matrix + + def generate_confusion_matrix( + self, + gt: np.ndarray, + preds: np.ndarray, + canonical_categories: list[int], + ) -> np.ndarray: + r""" + Create the confusion matrix for multi-label predictions. + The returned matrix can be used to compute precision, recall. + + In a perfect prediction the matrix should be diagonal. All elements outside of the main + diagonal indicate prediction errors and contribute to penalties in the calculation of + precision, recall. + + Inspired by "Multi-label classifier performance evaluation with confusion matrix" + https://csitcp.org/paper/10/108csit01.pdf + + Returns + ------- + np.ndarray [num_categories + 1, num_categories + 1]. The +1 is for the background class + """ + img_height, img_width = gt.shape + num_categories = len(canonical_categories) + + # confusion_matrix: [num_categories, num_categories] + confusion_matrix = np.zeros((num_categories, num_categories), dtype=float) + eye = np.eye(num_categories) + + ############################################################################################ + # Case 1: Perfect prediction + # + + # [img_height, img_width] + selections_case1 = gt == preds + + # 1. I = np.eye(num_categories): [num_categories, num_categories] + # 2. U = unpackbits(gt[selections_case1], num_categories)]: [k, num_categories] + # 3. 
C = U[:, None, :] * I[None, :, :] + # U = unpackbits(gt[selections_case1], num_categories) + # C = U[:, None, :] * eye[None, :, :] + + # case1_contributions: [num_pixels_with_perfect_preds, num_categories, num_categories] + case1_contributions = ( + unpackbits(gt[selections_case1], num_categories)[:, None, :] + * eye[None, :, :] + ) + + # print("Case1 contributions:") + # print(case1_contributions) + + # Validate the contributions + self._validate_contributions(gt[selections_case1], case1_contributions, "Case1") + + confusion_matrix += np.sum(case1_contributions, axis=0) + + ############################################################################################ + # Case 2: Prediction has all GT plus extra mistakes + # + + # Filter out the non-perfect predictions to take the ones where preds contain all GT bits + # [img_height, img_width] + selections_case2 = ~selections_case1 + selections_case2[selections_case2 == True] = ( + gt[selections_case2] & preds[selections_case2] == gt[selections_case2] + ) + + # [num_pixels_with_extra_preds, num_categories] + case2_preds_gt_intersection = preds[selections_case2] & gt[selections_case2] + case2_preds_gt_intersection = unpackbits( + case2_preds_gt_intersection, num_categories + ) + + # [num_pixels_with_extra_preds,] + case2_preds_pixels = preds[selections_case2] + if len(case2_preds_pixels) > 0: + # [num_pixels_with_extra_preds, num_categories] + case2_preds_gt_diff = ( + case2_preds_pixels ^ gt[selections_case2] + ) & case2_preds_pixels + case2_preds_gt_diff = unpackbits(case2_preds_gt_diff, num_categories) + + # [num_pixels_with_extra_preds, num_categories, num_categories] + case2_penalty = ( + case2_preds_gt_intersection[:, :, None] + * case2_preds_gt_diff[:, None, :] + ) + + # [num_pixels_with_extra_preds, num_categories, num_categories] + case2_gt_diagonals = ( + unpackbits(gt[selections_case2], num_categories)[:, None, :] + * eye[None, :, :] + ) + + # [num_pixels_with_extra_preds,] + case2_gt_multiplier = np.bitwise_count(gt[selections_case2]) + + # [num_pixels_with_extra_preds,] + case2_preds_divider = np.bitwise_count(case2_preds_pixels) + + # [num_pixels_with_extra_preds, num_categories, num_categories] + case2_gain = case2_gt_multiplier[:, None, None] * case2_gt_diagonals + + # [num_pixels_with_extra_preds, num_categories, num_categories] + case2_contributions = (case2_penalty + case2_gain) / case2_preds_divider[ + :, None, None + ] + # print("Case2 contributions:") + # print(case2_contributions) + + # Validate the contributions + self._validate_contributions( + gt[selections_case2], case2_contributions, "Case2" + ) + + confusion_matrix += np.sum(case2_contributions, axis=0) + + ############################################################################################ + # Case 3: GT has more labels than preds + # NOTICE: This case NEVER happens for us because our GT has only 1 label + # + + # [img_height, img_width] + selections_case3 = ~selections_case1 + selections_case3[selections_case3 == True] = ( + gt[selections_case3] | preds[selections_case3] == gt[selections_case3] + ) + + # [num_pixels_with_additional_gt_labels,] + case3_preds_pixels = preds[selections_case3] + if len(case3_preds_pixels) > 0: + # [num_pixels_with_additional_gt_labels, num_categories] + case3_gt_preds_diff = (case3_preds_pixels ^ gt[selections_case3]) & gt[ + selections_case3 + ] + case3_gt_preds_diff = unpackbits(case3_gt_preds_diff, num_categories) + + # [num_pixels_with_additional_gt_labels,] + case3_preds_divider = np.bitwise_count(case3_preds_pixels) + 
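+            # The block below spreads, for each pixel in this case, one unit of mass per
+            # GT label missing from the prediction evenly across the predicted-label
+            # columns (1 / |preds| each), while every predicted label keeps a 1 on the
+            # diagonal, so each GT-label row of the contribution still sums to 1.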
+ # [num_pixels_with_additional_gt_labels, num_categories, num_categories] + case3_preds_diagonals = ( + unpackbits(case3_preds_pixels, num_categories)[:, None, :] + * eye[None, :, :] + ) + + # [num_pixels_with_additional_gt_labels, num_categories] + case3_preds = unpackbits(case3_preds_pixels, num_categories) + + # [num_pixels_with_additional_gt_labels, num_categories, num_categories] + case3_penalty = ( + case3_gt_preds_diff[:, :, None] * case3_preds[:, None, :] + ) / case3_preds_divider[:, None, None] + + # [num_pixels_with_additional_gt_labels, num_categories, num_categories] + case3_contributions = case3_penalty + case3_preds_diagonals + # print("Case3 contributions:") + # print(case3_contributions) + + # Validate the contributions + self._validate_contributions( + gt[selections_case3], case3_contributions, "Case3" + ) + + confusion_matrix += np.sum(case3_contributions, axis=0) + + ############################################################################################ + # Case 4: Both GT and preds contain labels that are missing from the other one + # + + # [img_height, img_width] + general_diff = gt ^ preds + selections_case4 = np.logical_and( + (general_diff & gt) > 0, (general_diff & preds) > 0 + ) + + # [num_pixels_with_mutual_gt_pred_deltas, num_categories] + case4_gt_preds_diff = (preds[selections_case4] ^ gt[selections_case4]) & gt[ + selections_case4 + ] + case4_gt_preds_diff = unpackbits(case4_gt_preds_diff, num_categories) + + # [num_pixels_with_mutual_gt_pred_deltas, num_categories] + case4_preds_gt_diff = (preds[selections_case4] ^ gt[selections_case4]) & preds[ + selections_case4 + ] + if len(case4_preds_gt_diff) > 0: + case4_divider = np.bitwise_count(case4_preds_gt_diff) + case4_preds_gt_diff = unpackbits(case4_preds_gt_diff, num_categories) + + # [num_pixels_with_mutual_gt_pred_deltas, num_categories] + case4_preds_gt_intersection = preds[selections_case4] & gt[selections_case4] + + # [num_pixels_with_mutual_gt_pred_deltas, num_categories, num_categories] + case4_preds_gt_intersection_diagonals = ( + unpackbits(case4_preds_gt_intersection, num_categories)[:, None, :] + * eye[None, :, :] + ) + + # [num_pixels_with_mutual_gt_pred_deltas, num_categories, num_categories] + case4_penalty = ( + case4_gt_preds_diff[:, :, None] * case4_preds_gt_diff[:, None, :] + ) / case4_divider[:, None, None] + case4_contributions = case4_penalty + case4_preds_gt_intersection_diagonals + # print("Case4 contributions:") + # print(case4_contributions) + + # Validate the contributions + self._validate_contributions( + gt[selections_case4], case4_contributions, "Case4" + ) + + confusion_matrix += np.sum(case4_contributions, axis=0) + + return confusion_matrix + + def compute_metrics( + self, + confusion_matrix: np.ndarray, + class_names: dict[int, str], + colapse_non_bg: bool = False, + ) -> dict: + r""" + Parameters: + ----------- + confusion_matrix: np.ndarray[num_categories + 1, num_categories + 1] + class_names: Mapping from class_id to class_names + colapse_non_bg: Colapse all classes except of the first one that is assumed to be the BG + """ + # Compute metrics on the full confusion matrix + all_classes_metrics = self._compute_metrics_on_confusion( + confusion_matrix, class_names + ) + metrics = { + MultiLabelConfusionMatrix.DETAILED_METRICS_KEY: all_classes_metrics, + } + + if colapse_non_bg: + # Colapse the classes except the background and compute metrics again + colapsed_confusion_matrix = np.asarray( + [ + [confusion_matrix[0, 0], np.sum(confusion_matrix[0, 1:])], + 
[np.sum(confusion_matrix[1:, 0]), np.sum(confusion_matrix[1:, 1:])], + ] + ) + colapsed_class_names = { + 0: class_names[0], + 1: MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, + } + colapsed_metrics = self._compute_metrics_on_confusion( + colapsed_confusion_matrix, + colapsed_class_names, + ) + metrics[MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY] = colapsed_metrics + + return metrics + + def _compute_metrics_on_confusion( + self, + confusion_matrix: np.ndarray, + class_names: dict[int, str], + ): + col_sums = np.sum(confusion_matrix, axis=0) + row_sums = np.sum(confusion_matrix, axis=1) + + # Compute precision_matrix and recall_matrix + precision_matrix = np.divide( + confusion_matrix, + col_sums[None, :], + out=np.zeros(confusion_matrix.shape), + where=col_sums[None, :] != 0, + ) + recall_matrix = np.divide( + confusion_matrix, + row_sums[:, None], + out=np.zeros(confusion_matrix.shape), + where=row_sums[:, None] != 0, + ) + # Compute the f1 matrix element-wise + f1_matrix_nom = 2 * precision_matrix * recall_matrix + f1_matrix_denom = precision_matrix + recall_matrix + f1_matrix = np.divide( + f1_matrix_nom, + f1_matrix_denom, + out=np.zeros(confusion_matrix.shape), + where=f1_matrix_denom != 0, + ) + + # Extract diagonal vectors + precision = np.diag(precision_matrix) + recall = np.diag(recall_matrix) + f1 = np.diag(f1_matrix) + precision_mean = np.average(precision) + recall_mean = np.average(recall) + f1_mean = np.average(f1) + + # Generate dicts with metrics per class name + def get_class_name(class_id: int) -> str: + return class_names[class_id] + + def array_to_dict(a: np.ndarray) -> dict[str, float]: + a_dict = {get_class_name(i): float(x) for i, x in enumerate(a)} + return a_dict + + precision_dict = array_to_dict(precision) + recall_dict = array_to_dict(recall) + f1_dict = array_to_dict(f1) + + metrics = { + "precision_matrix": precision_matrix, + "recall_matrix": recall_matrix, + "f1_matrix": f1_matrix, + "classes_precision": precision_dict, + "classes_recall": recall_dict, + "classes_f1": f1_dict, + "classes_precision_mean": float(precision_mean), + "classes_recall_mean": float(recall_mean), + "classes_f1_mean": float(f1_mean), + } + return metrics + + def _validate_contributions( + self, + selected_gt: np.ndarray, + contributions: np.ndarray, + info: str, + ): + r""" + Each contribution has the properties: + 1. The sum of each row corresponding to labels in GT is equal to one. + 2. The sum of all elements is equal to cardinality of GT. 
+ + The validation is controled by self._validation_mode: + - "disabled": No validation + - "raise": Raise a ValueError + - "log": Write an error log message + + Parameters: + ----------- + selected_gt: np.ndarray 1D array with size=selected_pixels, each pixel is a uint64 encoding + contributions: np.ndarray [selected_pixels, num_classes, num_classes] + """ + if self._validation_mode == "disabled": + return + + contributions_shape = contributions.shape + if len(contributions_shape) != 3: + return + + num_categories = contributions_shape[1] + + selected_pixels = np.prod(selected_gt.shape) + if selected_pixels != contributions_shape[0]: + self._handle_error(f"{info}: Wrong contributions dimension") + + # Row sum check + row_sum = np.sum(contributions, axis=2) + expected_row_sum = unpackbits(selected_gt, num_categories) + if not np.all(row_sum == expected_row_sum): + self._handle_error(f"{info}: Wrong contributions row sums") + + # Full sum check + full_sum = np.sum(row_sum) + expected_full_sum = np.sum(np.bitwise_count(selected_gt)) + if full_sum != expected_full_sum: + self._handle_error(f"{info}: Wrong contributions full sums") + + def _handle_error(self, msg: str): + if self._validation_mode == "raise": + raise ValueError(msg) + else: + _log.error(msg) diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py new file mode 100644 index 00000000..1d434b2e --- /dev/null +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -0,0 +1,332 @@ +import glob +import logging +from collections import defaultdict +from enum import Enum +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from datasets import Dataset, load_dataset +from docling_core.types.doc.document import ( + DEFAULT_EXPORT_LABELS, + ContentLayer, + DocItem, + DoclingDocument, +) +from docling_core.types.doc.labels import DocItemLabel +from tqdm import tqdm # type: ignore + +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction +from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats +from docling_eval.evaluators.base_evaluator import ( + BaseEvaluator, + EvaluationRejectionType, + docling_document_from_doctags, +) +from docling_eval.evaluators.layout_evaluator import MissingPredictionStrategy +from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( + LayoutResolution, + MultiLabelConfusionMatrix, +) + +_log = logging.getLogger(__name__) + + +class PixelLayoutEvaluator(BaseEvaluator): + r""" + Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices + (precision, recall, f1). + """ + + def __init__( + self, + label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, + intermediate_evaluations_path: Optional[Path] = None, + prediction_sources: List[PredictionFormats] = [], + missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE, + ): + r""" + + Parameters: + ----------- + label_mapping: Optional parameter to map DocItemLabels to other DocItemLabels. 
+ If a label is mapped to None, it means not to use that label + """ + supported_prediction_formats: List[PredictionFormats] = [ + PredictionFormats.DOCLING_DOCUMENT, + PredictionFormats.DOCTAGS, + PredictionFormats.JSON, + PredictionFormats.YAML, + ] + if not prediction_sources: + prediction_sources = supported_prediction_formats + super().__init__( + intermediate_evaluations_path=intermediate_evaluations_path, + prediction_sources=prediction_sources, + supported_prediction_formats=supported_prediction_formats, + ) + + self._missing_prediction_strategy = missing_prediction_strategy + + # Initialize the multi label confusion matrix calculator + self._mlcm = MultiLabelConfusionMatrix(validation_mode="disabled") + + self._set_categories(label_mapping) + + def _set_categories( + self, + label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, + ): + r""" + Set the categories index and reversed index + """ + label_to_id: dict[str, int] = { + label: i for i, label in enumerate(DEFAULT_EXPORT_LABELS) + } + + self._category_name_to_id: Dict[str, int] = {} + if label_mapping: + for label in DEFAULT_EXPORT_LABELS: + if label in label_mapping: + mapped_label = label_mapping.get(label) + if not mapped_label: # Skip a label that maps to None + continue + self._category_name_to_id[label] = label_to_id[mapped_label] + else: + self._category_name_to_id[label] = label_to_id[label] + else: + self._category_name_to_id = label_to_id + + self._category_id_to_name: Dict[int, str] = { + cat_id: cat_name for cat_name, cat_id in self._category_name_to_id.items() + } + + def __call__( + self, + ds_path: Path, + split: str = "test", + ): + _log.info("Loading the split '%s' from: '%s'", split, ds_path) + + # Load the dataset + split_path = str(ds_path / split / "*.parquet") + split_files = glob.glob(split_path) + _log.info("#-files: %s", len(split_files)) + ds = load_dataset("parquet", data_files={split: split_files}) + _log.info("Overview of dataset: %s", ds) + + # Select the split + ds_selection: Dataset = ds[split] + + # Results containers + rejected_samples: Dict[EvaluationRejectionType, int] = { + EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0, + EvaluationRejectionType.MISSING_PREDICTION: 0, + EvaluationRejectionType.MISMATHCED_DOCUMENT: 0, + } + doc_stats: Dict[str, Dict[str, int]] = {} + + matrix_categories_ids: List[int] = list(self._category_id_to_name.keys()) + num_categories = len(matrix_categories_ids) + confusion_matrix_sum = np.zeros((num_categories, num_categories)) + + for i, data in tqdm( + enumerate(ds_selection), + desc="Layout evaluations", + ncols=120, + total=len(ds_selection), + ): + data_record = DatasetRecordWithPrediction.model_validate(data) + doc_id = data_record.doc_id + if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", doc_id + ) + rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 + continue + + true_doc = data_record.ground_truth_doc + pred_doc = self._get_pred_doc(data_record) + if not pred_doc: + _log.error("There is no prediction for doc_id=%s", doc_id) + rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 + continue + + # TODO: Check in the end if to optimize the memory allocation for the intermediate CMs + doc_cm = self._compute_document_confusion_matrix(true_doc, pred_doc) + + # TODO: Check if to compute metrics per document + confusion_matrix_sum += doc_cm + + # TODO: Compute metrics + ds_metrics = self._mlcm.compute_metrics( + 
confusion_matrix_sum, + self._category_id_to_name, + True, + ) + + def _compute_document_confusion_matrix( + self, + true_doc: DoclingDocument, + pred_doc: DoclingDocument, + ) -> np.ndarray: + r""" + Compute the confusion matrix for the given documents. + This is the sum of the confusion matrices of the document pages. + """ + + # Collect all DocItems by page for both GT and predictions + true_pages_to_objects = self._collect_items_by_page(true_doc) + pred_pages_to_objects = self._collect_items_by_page(pred_doc) + + # Get all pages that have GT data (we evaluate based on GT pages) + gt_pages = set(true_pages_to_objects.keys()) + pred_pages = set(pred_pages_to_objects.keys()) + _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}") + + matrix_categories_ids: List[int] = list(self._category_id_to_name.keys()) + num_categories = len(matrix_categories_ids) + off_diagonal_cells = num_categories * num_categories - num_categories + confusion_matrix_sum = np.zeros((num_categories, num_categories)) + # num_images = 0 + # num_pixels = 0 + # all_image_metrics: dict[str, dict] = {} # image_filename -> image_metrics + + for page_no in sorted(gt_pages): + page_size = true_doc.pages[page_no].size + pg_width = page_size.width + pg_height = page_size.height + + # Always process GT for this page + gt_layouts = self._get_page_layout_resolution( + page_no=page_no, + items=true_pages_to_objects[page_no], + doc=true_doc, + ) + + # Handle prediction for this page based on strategy + if page_no in pred_pages: + # We have prediction data for this page + pred_layouts = self._get_page_layout_resolution( + page_no=page_no, + items=pred_pages_to_objects[page_no], + doc=pred_doc, + ) + + # Compute the confusion matrix + gt_binary = self._mlcm.make_binary_representation( + pg_width, pg_height, gt_layouts + ) + preds_binary = self._mlcm.make_binary_representation( + pg_width, pg_height, pred_layouts + ) + confusion_matrix_sum += self._mlcm.generate_confusion_matrix( + gt_binary, preds_binary, matrix_categories_ids + ) + else: + # No prediction data for this page + if ( + self._missing_prediction_strategy + == MissingPredictionStrategy.PENALIZE + ): + # Create a penalty confusion matrix + image_pixels = pg_width * pg_height + penalty_value = image_pixels / off_diagonal_cells + confusion_matrix_sum += penalty_value * ( + np.ones((num_categories, num_categories)) + - np.eye(num_categories) + ) + elif ( + self._missing_prediction_strategy + == MissingPredictionStrategy.IGNORE + ): + # Skip this page entirely + continue + else: + raise ValueError( + f"Unknown missing prediction strategy: {self._missing_prediction_strategy}" + ) + return confusion_matrix_sum + + def _get_page_layout_resolution( + self, + page_no: int, + items: List[DocItem], + doc: DoclingDocument, + ) -> List[LayoutResolution]: + r""" + Generate a list of LayoutResolution objects for the given document page + Each LayoutResolution corresponds to one bbox and its category_id + """ + page_size = doc.pages[page_no].size + page_height = page_size.height + + resolutions: List[LayoutResolution] = [] + for item in items: + for prov in item.prov: + if prov.page_no != page_no: + # Only process provenances for this specific page + continue + + category_id = self._category_name_to_id[item.label] + bbox: List[int] = list( + prov.bbox.to_top_left_origin(page_height=page_height).as_tuple() + ) + resolutions.append(LayoutResolution(category_id=category_id, bbox=bbox)) + return resolutions + + def _collect_items_by_page( + self, + doc: 
DoclingDocument, + ) -> Dict[int, List[DocItem]]: + """ + Collect DocItems by page number for the given document and filter labels. + + Args: + doc: The DoclingDocument to process + + Returns: + Dictionary mapping page numbers to lists of DocItems + """ + pages_to_objects: Dict[int, List[DocItem]] = defaultdict(list) + + for item, level in doc.iterate_items( + included_content_layers={ + c for c in ContentLayer if c != ContentLayer.BACKGROUND + }, + traverse_pictures=True, + with_groups=True, + ): + if isinstance(item, DocItem): + for prov in item.prov: + pages_to_objects[prov.page_no].append(item) + + return pages_to_objects + + def _get_pred_doc( + self, data_record: DatasetRecordWithPrediction + ) -> Optional[DoclingDocument]: + r""" + Get the predicted DoclingDocument + """ + # TODO: Duplicated code from LayoutEvaluator + pred_doc = None + for prediction_format in self._prediction_sources: + if prediction_format == PredictionFormats.DOCLING_DOCUMENT: + pred_doc = data_record.predicted_doc + elif prediction_format == PredictionFormats.JSON: + if data_record.original_prediction: + pred_doc = DoclingDocument.load_from_json( + data_record.original_prediction + ) + elif prediction_format == PredictionFormats.YAML: + if data_record.original_prediction: + pred_doc = DoclingDocument.load_from_yaml( + data_record.original_prediction + ) + elif prediction_format == PredictionFormats.DOCTAGS: + pred_doc = docling_document_from_doctags(data_record) + if pred_doc is not None: + break + + return pred_doc From a324d6f20134adde968591028234a68a57113875 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 13 Nov 2025 13:19:50 +0100 Subject: [PATCH 03/15] fix: PixelLayoutEvaluator: Fix the matrix categories mappings. Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/pixel_layout_evaluator.py | 105 +++++++++++++----- tests/test_pixel_layout_evaluator.py | 37 ++++++ 2 files changed, 117 insertions(+), 25 deletions(-) create mode 100644 tests/test_pixel_layout_evaluator.py diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 1d434b2e..6b3f37c7 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -14,6 +14,7 @@ DoclingDocument, ) from docling_core.types.doc.labels import DocItemLabel +from docling_ibm_models.layoutmodel.labels import LayoutLabels from tqdm import tqdm # type: ignore from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction @@ -32,6 +33,12 @@ _log = logging.getLogger(__name__) +def category_name_to_docitemlabel(category_name: str) -> DocItemLabel: + r""" """ + label = DocItemLabel(category_name.lower().replace(" ", "_").replace("-", "_")) + return label + + class PixelLayoutEvaluator(BaseEvaluator): r""" Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices @@ -71,35 +78,83 @@ def __init__( # Initialize the multi label confusion matrix calculator self._mlcm = MultiLabelConfusionMatrix(validation_mode="disabled") - self._set_categories(label_mapping) + # Initialize the mappings between DocItemLabel <-> category_id <-> category_name + self._matrix_doclabelitem_to_id: Dict[ + DocItemLabel, int + ] # DocLabelItem to cat_id (shifted to make space for Background) + self._matrix_id_to_name: Dict[ + int, str + ] # shifted cat_id to string (to include Background) + self._matrix_doclabelitem_to_id, self._matrix_id_to_name = ( + self._build_matrix_categories(label_mapping) + ) - def 
_set_categories( + def _build_matrix_categories( self, label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, - ): + ) -> Tuple[ + Dict[DocItemLabel, int], + Dict[int, str], + ]: r""" - Set the categories index and reversed index + Create mappings for the matrix categories including the background (shifted) while taking + into account the label_mappings: + + Returns: + -------- + matrix_doclabelitem_to_id: Dict[DocItemLabel, int] + From DocItemLabel to shifted category_id (the values do NOT contain zero) + If the label_mapping maps to None, this entry is omitted + + matrix_id_to_name: Dict[int, str] + From shifted_category_id to string. + For key==0 the value is Background, otherwise the value of the corresponding DocItemLabel + taking into account any label mapping + If the label_mapping maps to None, this entry is omitted + """ - label_to_id: dict[str, int] = { - label: i for i, label in enumerate(DEFAULT_EXPORT_LABELS) + layout_labels = LayoutLabels() + + # Auxiliary mapping: DocItemLabel -> canonical_category_id + canonical_to_id: Dict[str, int] = layout_labels.canonical_to_int() + label_to_id: Dict[DocItemLabel, int] = { + DocItemLabel(cat_name.lower().replace(" ", "_").replace("-", "_")): cat_id + for cat_name, cat_id in canonical_to_id.items() } - self._category_name_to_id: Dict[str, int] = {} - if label_mapping: - for label in DEFAULT_EXPORT_LABELS: - if label in label_mapping: - mapped_label = label_mapping.get(label) - if not mapped_label: # Skip a label that maps to None + # Populate the matrix_doclabelitem_to_id + matrix_doclabelitem_to_id: Dict[DocItemLabel, int] = ( + {} + ) # The values are shifted (not including zero) + label_id_offset = 1 + + # TODO: If label_mappings are provided, we end up having more than one DocItemLabel with the same cat_id + for label, canonical_cat_id in label_to_id.items(): + effective_label = label + if label_mapping and label in label_mapping: + effective_label = label_mapping.get(label) + if not effective_label: # Skip labels that map to None + continue + matrix_doclabelitem_to_id[label] = ( + label_to_id[effective_label] + label_id_offset + ) + + # Populate the matrix_id_to_name + matrix_id_to_name: Dict[int, str] = {} # The keys start from 0 to include BG + shifted_canonical: Dict[int, str] = layout_labels.shifted_canonical_categories() + + # TODO: If label_mappings are provided we end up having more than 1 cat_id with the same name + for shifted_cat_id, cat_name in shifted_canonical.items(): + label = None + if cat_name != shifted_canonical[0]: + label = category_name_to_docitemlabel(cat_name) + if label_mapping and label in label_mapping: + label = label_mapping.get(label) + if not label: # Skip labels that map to None continue - self._category_name_to_id[label] = label_to_id[mapped_label] - else: - self._category_name_to_id[label] = label_to_id[label] - else: - self._category_name_to_id = label_to_id + matrix_id_to_name[shifted_cat_id] = label.value if label else cat_name - self._category_id_to_name: Dict[int, str] = { - cat_id: cat_name for cat_name, cat_id in self._category_name_to_id.items() - } + return matrix_doclabelitem_to_id, matrix_id_to_name def __call__( self, @@ -126,7 +181,7 @@ def __call__( } doc_stats: Dict[str, Dict[str, int]] = {} - matrix_categories_ids: List[int] = list(self._category_id_to_name.keys()) + matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) num_categories = len(matrix_categories_ids) confusion_matrix_sum = np.zeros((num_categories, num_categories)) @@ -161,7 +216,7 
@@ def __call__( # TODO: Compute metrics ds_metrics = self._mlcm.compute_metrics( confusion_matrix_sum, - self._category_id_to_name, + self._matrix_id_to_name, True, ) @@ -184,7 +239,7 @@ def _compute_document_confusion_matrix( pred_pages = set(pred_pages_to_objects.keys()) _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}") - matrix_categories_ids: List[int] = list(self._category_id_to_name.keys()) + matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) num_categories = len(matrix_categories_ids) off_diagonal_cells = num_categories * num_categories - num_categories confusion_matrix_sum = np.zeros((num_categories, num_categories)) @@ -229,7 +284,7 @@ def _compute_document_confusion_matrix( self._missing_prediction_strategy == MissingPredictionStrategy.PENALIZE ): - # Create a penalty confusion matrix + # Create a penalty confusion matrix by distributing all pixels outside of diagonal image_pixels = pg_width * pg_height penalty_value = image_pixels / off_diagonal_cells confusion_matrix_sum += penalty_value * ( @@ -268,7 +323,7 @@ def _get_page_layout_resolution( # Only process provenances for this specific page continue - category_id = self._category_name_to_id[item.label] + category_id = self._matrix_doclabelitem_to_id[item.label] bbox: List[int] = list( prov.bbox.to_top_left_origin(page_height=page_height).as_tuple() ) diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py new file mode 100644 index 00000000..be9d7036 --- /dev/null +++ b/tests/test_pixel_layout_evaluator.py @@ -0,0 +1,37 @@ +from pathlib import Path +from typing import Dict, Optional + +import pytest +from docling.datamodel.base_models import ConversionStatus +from docling_core.types.doc.labels import DocItemLabel + +from docling_eval.datamodels.types import PredictionFormats +from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator +from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator + + +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_layout_evaluator(): + r""" """ + test_dataset_dir = Path("scratch/DPBench/eval_dataset_e2e") + + # Default evaluator + eval1 = PixelLayoutEvaluator() + # v1 = eval1(test_dataset_dir) + # assert v1 is not None + + # Custom label mappings + label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = { + DocItemLabel.CAPTION: DocItemLabel.TITLE, + DocItemLabel.DOCUMENT_INDEX: None, + } + eval2 = PixelLayoutEvaluator(label_mapping=label_mapping) + # v2 = eval2(test_dataset_dir) + # assert v2 is not None + + +if __name__ == "__main__": + test_layout_evaluator() From dc5206c7ef99d06fc775f5008be6d6aaf9383c7e Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 13 Nov 2025 16:34:45 +0100 Subject: [PATCH 04/15] feat: PixelLayoutEvaluator: Introduce pydantic types as transfer objects between the PixelLayoutEvaluator and MultiLabelConfusionMatrix. Stabilize the outputs. Still need to work on save/export. 
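A rough usage sketch of the new transfer objects (evaluator constructor and call
signature as introduced in this patch; the dataset path is the one used by the unit
test and only illustrative):

    from pathlib import Path

    from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator

    evaluator = PixelLayoutEvaluator()
    ds_evaluation, ds_matrix_evaluation = evaluator(Path("scratch/DPBench/eval_dataset_e2e"))

    # DatasetPixelLayoutEvaluation: aggregated and per-page pydantic metrics
    print(ds_evaluation.num_pages, ds_evaluation.detailed_metrics.classes_f1_mean)

    # MultiLabelMatrixEvaluation: raw confusion / precision / recall / f1 matrices
    print(ds_matrix_evaluation.detailed_metrics.confusion_matrix.shape)
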
Signed-off-by: Nikos Livathinos --- .../pixel/confusion_matrix_exporter.py | 59 --------- .../pixel/multi_label_confusion_matrix.py | 80 ++++++++---- .../evaluators/pixel_layout_evaluator.py | 116 +++++++++++++----- tests/test_pixel_layout_evaluator.py | 4 +- 4 files changed, 147 insertions(+), 112 deletions(-) diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index 718f0537..12687137 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -63,28 +63,6 @@ def __init__( bottom=Side(border_style=border_style, color=border_color), ) - def export_excel_from_json( - self, - save_path: Path, - pixel_evaluations_fn: Path, - ): - r""" """ - with open(pixel_evaluations_fn, "r") as fd: - pixel_evaluations = json.load(fd) - - # Reconsturct the confusion matrix - confusion_matrix_as_list = pixel_evaluations["confusion_matrix"] - confusion_matrix = np.asarray(confusion_matrix_as_list, dtype=np.float32) - - # Reconstruct the headers - class_names: dict[int, str] = pixel_evaluations["classes_names"] - headers = list(class_names.values()) - save_path.mkdir(parents=True, exist_ok=True) - excel_fn = save_path / f"{pixel_evaluations_fn.stem}.xlsx" - - # TODO: - # self._export_matrix_to_excel(confusion_matrix, headers, excel_fn) - def build_ds_report( self, model_name: str, @@ -508,40 +486,3 @@ def _adjust_column_widths(self, ws: Worksheet): val = str(cell.value) max_length = max(max_length, len(val)) ws.column_dimensions[col_letter].width = max_length + 2 - - -def main(): - r""" """ - # Parse CLI arguments - parser = argparse.ArgumentParser( - description="Run the PixelLayoutEvaluator with the GT in COCO-format and the predictions in COCO-tools format" - ) - parser.add_argument( - "-s", - "--save_dir", - type=Path, - required=True, - help="Root save directory to save the exported files", - ) - parser.add_argument( - "-p", - "--pixel_evaluations", - type=Path, - required=True, - help="Json with the pixel evaluations", - ) - args = parser.parse_args() - save_dir = args.save_dir - pixel_eval_fn = args.pixel_evaluations - - # Configure logger - log_format = "%(asctime)s - %(levelname)s - %(message)s" - logging.basicConfig(level=logging.INFO, format=log_format) - - # Initialize the exporter - exporter = ConfusionMatrixExporter() - exporter.export_excel_from_json(save_dir, pixel_eval_fn) - - -if __name__ == "__main__": - main() diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index 658fa4dc..81619f53 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -17,6 +17,34 @@ class LayoutResolution(BaseModel): bbox: list[float] +class MultiLabelMatrixAggMetrics(BaseModel): + classes_precision: dict[str, float] + classes_recall: dict[str, float] + classes_f1: dict[str, float] + + classes_precision_mean: float + classes_recall_mean: float + classes_f1_mean: float + + +class MultiLabelMatrixMetrics(BaseModel): + class Config: + arbitrary_types_allowed = True + + confusion_matrix: np.ndarray + + precision_matrix: np.ndarray + recall_matrix: np.ndarray + f1_matrix: np.ndarray + + agg_metrics: MultiLabelMatrixAggMetrics + + +class MultiLabelMatrixEvaluation(BaseModel): + detailed_metrics: MultiLabelMatrixMetrics + colapsed_metrics: Optional[MultiLabelMatrixMetrics] = None + + def 
unpackbits(x: np.ndarray, num_bits: int): r""" Unpack num_bits bits of each element of the numpy array x @@ -312,21 +340,21 @@ def compute_metrics( confusion_matrix: np.ndarray, class_names: dict[int, str], colapse_non_bg: bool = False, - ) -> dict: + ) -> MultiLabelMatrixEvaluation: r""" Parameters: ----------- confusion_matrix: np.ndarray[num_categories + 1, num_categories + 1] class_names: Mapping from class_id to class_names colapse_non_bg: Colapse all classes except of the first one that is assumed to be the BG + + Returns + -------- + """ # Compute metrics on the full confusion matrix - all_classes_metrics = self._compute_metrics_on_confusion( - confusion_matrix, class_names - ) - metrics = { - MultiLabelConfusionMatrix.DETAILED_METRICS_KEY: all_classes_metrics, - } + detailed_metrics = self._compute_matrix_metrics(confusion_matrix, class_names) + evaluation = MultiLabelMatrixEvaluation(detailed_metrics=detailed_metrics) if colapse_non_bg: # Colapse the classes except the background and compute metrics again @@ -340,19 +368,20 @@ def compute_metrics( 0: class_names[0], 1: MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, } - colapsed_metrics = self._compute_metrics_on_confusion( + colapsed_metrics = self._compute_matrix_metrics( colapsed_confusion_matrix, colapsed_class_names, ) - metrics[MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY] = colapsed_metrics + evaluation.colapsed_metrics = colapsed_metrics - return metrics + return evaluation - def _compute_metrics_on_confusion( + def _compute_matrix_metrics( self, confusion_matrix: np.ndarray, class_names: dict[int, str], - ): + ) -> MultiLabelMatrixMetrics: + r""" """ col_sums = np.sum(confusion_matrix, axis=0) row_sums = np.sum(confusion_matrix, axis=1) @@ -399,17 +428,22 @@ def array_to_dict(a: np.ndarray) -> dict[str, float]: recall_dict = array_to_dict(recall) f1_dict = array_to_dict(f1) - metrics = { - "precision_matrix": precision_matrix, - "recall_matrix": recall_matrix, - "f1_matrix": f1_matrix, - "classes_precision": precision_dict, - "classes_recall": recall_dict, - "classes_f1": f1_dict, - "classes_precision_mean": float(precision_mean), - "classes_recall_mean": float(recall_mean), - "classes_f1_mean": float(f1_mean), - } + agg_metrics = MultiLabelMatrixAggMetrics( + classes_precision=precision_dict, + classes_recall=recall_dict, + classes_f1=f1_dict, + classes_precision_mean=float(precision_mean), + classes_recall_mean=float(recall_mean), + classes_f1_mean=float(f1_mean), + ) + + metrics = MultiLabelMatrixMetrics( + confusion_matrix=confusion_matrix, + precision_matrix=precision_matrix, + recall_matrix=recall_matrix, + f1_matrix=f1_matrix, + agg_metrics=agg_metrics, + ) return metrics def _validate_contributions( diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 6b3f37c7..a13aae14 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -1,5 +1,6 @@ import glob import logging +import math from collections import defaultdict from enum import Enum from pathlib import Path @@ -7,14 +8,10 @@ import numpy as np from datasets import Dataset, load_dataset -from docling_core.types.doc.document import ( - DEFAULT_EXPORT_LABELS, - ContentLayer, - DocItem, - DoclingDocument, -) +from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument from docling_core.types.doc.labels import DocItemLabel from docling_ibm_models.layoutmodel.labels import LayoutLabels +from pydantic import 
BaseModel from tqdm import tqdm # type: ignore from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction @@ -28,11 +25,29 @@ from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( LayoutResolution, MultiLabelConfusionMatrix, + MultiLabelMatrixAggMetrics, + MultiLabelMatrixEvaluation, ) _log = logging.getLogger(__name__) +class PagePixelLayoutEvaluation(BaseModel): + doc_id: str + page_no: int + detailed_metrics: MultiLabelMatrixAggMetrics + colapsed_metrics: Optional[MultiLabelMatrixAggMetrics] = None + + +class DatasetPixelLayoutEvaluation(BaseModel): + num_pages: int + num_pixels: int + rejected_samples: Dict[EvaluationRejectionType, int] + detailed_metrics: MultiLabelMatrixAggMetrics + colapsed_metrics: Optional[MultiLabelMatrixAggMetrics] = None + page_evaluations: List[PagePixelLayoutEvaluation] + + def category_name_to_docitemlabel(category_name: str) -> DocItemLabel: r""" """ label = DocItemLabel(category_name.lower().replace(" ", "_").replace("-", "_")) @@ -160,7 +175,7 @@ def __call__( self, ds_path: Path, split: str = "test", - ): + ) -> Tuple[DatasetPixelLayoutEvaluation, MultiLabelMatrixEvaluation]: _log.info("Loading the split '%s' from: '%s'", split, ds_path) # Load the dataset @@ -179,11 +194,11 @@ def __call__( EvaluationRejectionType.MISSING_PREDICTION: 0, EvaluationRejectionType.MISMATHCED_DOCUMENT: 0, } - doc_stats: Dict[str, Dict[str, int]] = {} - matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) num_categories = len(matrix_categories_ids) - confusion_matrix_sum = np.zeros((num_categories, num_categories)) + ds_confusion_matrix = np.zeros((num_categories, num_categories)) + all_pages_evaluations: List[PagePixelLayoutEvaluation] = [] + ds_num_pixels = 0 for i, data in tqdm( enumerate(ds_selection), @@ -192,7 +207,7 @@ def __call__( total=len(ds_selection), ): data_record = DatasetRecordWithPrediction.model_validate(data) - doc_id = data_record.doc_id + doc_id: str = data_record.doc_id if data_record.status not in self._accepted_status: _log.error( "Skipping record without successfull conversion status: %s", doc_id @@ -207,24 +222,67 @@ def __call__( rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 continue - # TODO: Check in the end if to optimize the memory allocation for the intermediate CMs - doc_cm = self._compute_document_confusion_matrix(true_doc, pred_doc) + # Compute confusion matrices + pages_confusion_matrices, num_pixels = ( + self._compute_document_confusion_matrix(true_doc, pred_doc) + ) + + # Compute metrics per page + for page_no, page_confusion_matrix in pages_confusion_matrices.items(): + ds_confusion_matrix += page_confusion_matrix + page_metrics = self._mlcm.compute_metrics( + page_confusion_matrix, + self._matrix_id_to_name, + True, + ) + page_evaluation = PagePixelLayoutEvaluation( + doc_id=doc_id, + page_no=page_no, + detailed_metrics=page_metrics.detailed_metrics, + colapsed_metrics=page_metrics.colapsed_metrics, + ) + all_pages_evaluations.append(page_evaluation) - # TODO: Check if to compute metrics per document - confusion_matrix_sum += doc_cm + ds_num_pixels += num_pixels - # TODO: Compute metrics - ds_metrics = self._mlcm.compute_metrics( - confusion_matrix_sum, + # Compute metrics for the dataset and each document + ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics( + ds_confusion_matrix, self._matrix_id_to_name, True, ) + ds_evaluation = DatasetPixelLayoutEvaluation( + num_pages=len(all_pages_evaluations), + num_pixels=num_pixels, + 
rejected_samples=rejected_samples, + detailed_metrics=ds_matrix_evaluation.detailed_metrics, + colapsed_metrics=ds_matrix_evaluation.colapsed_metrics, + page_evaluations=all_pages_evaluations, + ) + + return ds_evaluation, ds_matrix_evaluation + + def save_evaluations( + self, + ds_evaluation: DatasetPixelLayoutEvaluation, + ds_matrix_evaluation: MultiLabelMatrixEvaluation, + save_root: Path, + excel_reports: bool = True, + ): + r""" + Save all evaluations as jsons and excel reports + """ + pass + def _compute_document_confusion_matrix( self, true_doc: DoclingDocument, pred_doc: DoclingDocument, - ) -> np.ndarray: + ) -> Tuple[ + Dict[int, np.ndarray], # page_no -> page confusion matrix + int, # num_pixels + ]: r""" Compute the confusion matrix for the given documents. This is the sum of the confusion matrices of the document pages. @@ -242,15 +300,13 @@ def _compute_document_confusion_matrix( matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) num_categories = len(matrix_categories_ids) off_diagonal_cells = num_categories * num_categories - num_categories - confusion_matrix_sum = np.zeros((num_categories, num_categories)) - # num_images = 0 - # num_pixels = 0 - # all_image_metrics: dict[str, dict] = {} # image_filename -> image_metrics + page_confusion_matrices: Dict[int, np.ndarray] = {} + num_pixels = 0 for page_no in sorted(gt_pages): page_size = true_doc.pages[page_no].size - pg_width = page_size.width - pg_height = page_size.height + pg_width = math.ceil(page_size.width) + pg_height = math.ceil(page_size.height) # Always process GT for this page gt_layouts = self._get_page_layout_resolution( @@ -275,9 +331,11 @@ def _compute_document_confusion_matrix( preds_binary = self._mlcm.make_binary_representation( pg_width, pg_height, pred_layouts ) - confusion_matrix_sum += self._mlcm.generate_confusion_matrix( + page_confusion_matrix = self._mlcm.generate_confusion_matrix( gt_binary, preds_binary, matrix_categories_ids ) + num_pixels += pg_width * pg_height + page_confusion_matrices[page_no] = page_confusion_matrix else: # No prediction data for this page if ( @@ -287,10 +345,12 @@ def _compute_document_confusion_matrix( # Create a penalty confusion matrix by distributing all pixels outside of diagonal image_pixels = pg_width * pg_height penalty_value = image_pixels / off_diagonal_cells - confusion_matrix_sum += penalty_value * ( + page_confusion_matrix = penalty_value * ( np.ones((num_categories, num_categories)) - np.eye(num_categories) ) + num_pixels += image_pixels + page_confusion_matrices[page_no] = page_confusion_matrix elif ( self._missing_prediction_strategy == MissingPredictionStrategy.IGNORE @@ -301,7 +361,7 @@ def _compute_document_confusion_matrix( raise ValueError( f"Unknown missing prediction strategy: {self._missing_prediction_strategy}" ) - return confusion_matrix_sum + return page_confusion_matrices, num_pixels def _get_page_layout_resolution( self, diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index be9d7036..c3ea19cc 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -20,8 +20,8 @@ def test_layout_evaluator(): # Default evaluator eval1 = PixelLayoutEvaluator() - # v1 = eval1(test_dataset_dir) - # assert v1 is not None + v1 = eval1(test_dataset_dir) + assert v1 is not None # Custom label mappings label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = { From 0a2e728e12666774544a25141e34ea709bb5f42b Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: 
Thu, 13 Nov 2025 20:17:21 +0100 Subject: [PATCH 05/15] feat: WIP: Implementing the PixelLayoutEvaluator::save_evaluations(). Allow the pydantic types to serialize numpy arrays as lists. Integrate with main. Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 19 ++- .../pixel/confusion_matrix_exporter.py | 2 + .../pixel/multi_label_confusion_matrix.py | 16 ++- .../evaluators/pixel_layout_evaluator.py | 114 ++++++++++++++---- tests/test_pixel_layout_evaluator.py | 25 ++-- 5 files changed, 140 insertions(+), 36 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 388bfa6b..922a0c25 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -96,6 +96,13 @@ OCREvaluator, OCRVisualizer, ) +from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( + MultiLabelMatrixEvaluation, +) +from docling_eval.evaluators.pixel_layout_evaluator import ( + DatasetPixelLayoutEvaluation, + PixelLayoutEvaluator, +) from docling_eval.evaluators.readingorder_evaluator import ( DatasetReadingOrderEvaluation, ReadingOrderEvaluator, @@ -668,7 +675,17 @@ def evaluate( with open(save_fn, "w") as fd: json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) - # TODO: Add also the pixel-wise layout evaluation + # Evaluate with the pixel-wise layout evaluation + pixel_layout_evaluator = PixelLayoutEvaluator() + pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator( + idir, split=split + ) + pixel_save_root: Path = save_fn.parent / "pixel_layout_evaluations" + pixel_layout_evaluator.save_evaluations( + benchmark, + pixel_ds_evaluation, + pixel_save_root, + ) elif modality == EvaluationModality.TABLE_STRUCTURE: table_evaluator = TableEvaluator() diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index 12687137..e5d482b2 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -76,6 +76,7 @@ def build_ds_report( excel_fn: Path, visualisations_root: Optional[Path], ): + # TODO: The new design produces pydantic objects for metrics instead of dicts r""" Generate excel report for the full dataset """ @@ -238,6 +239,7 @@ def _build_base_report( headers: list[str], confusion_matrix: np.ndarray, metrics: dict, + # matrix_metrics: MultiLabelMatrixEvaluation, startrow: int = 0, hide_zero_rows: bool = True, hide_zero_cols: bool = True, diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index 81619f53..0586dab5 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -1,9 +1,9 @@ import logging import math -from typing import Optional +from typing import Any, Optional import numpy as np -from pydantic import BaseModel +from pydantic import BaseModel, model_serializer _log = logging.getLogger(__name__) @@ -28,17 +28,23 @@ class MultiLabelMatrixAggMetrics(BaseModel): class MultiLabelMatrixMetrics(BaseModel): - class Config: - arbitrary_types_allowed = True + model_config = {"arbitrary_types_allowed": True} confusion_matrix: np.ndarray - precision_matrix: np.ndarray recall_matrix: np.ndarray f1_matrix: np.ndarray agg_metrics: MultiLabelMatrixAggMetrics + @model_serializer(mode="wrap") + def serialize_model(self, serializer: Any) -> dict: + data = serializer(self) + for field_name, field_value in 
self.__dict__.items(): + if isinstance(field_value, np.ndarray): + data[field_name] = field_value.tolist() + return data + class MultiLabelMatrixEvaluation(BaseModel): detailed_metrics: MultiLabelMatrixMetrics diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index a13aae14..c8d4380c 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -1,4 +1,5 @@ import glob +import json import logging import math from collections import defaultdict @@ -15,17 +16,23 @@ from tqdm import tqdm # type: ignore from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction -from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats +from docling_eval.datamodels.types import ( + BenchMarkColumns, + BenchMarkNames, + PredictionFormats, +) from docling_eval.evaluators.base_evaluator import ( BaseEvaluator, EvaluationRejectionType, docling_document_from_doctags, ) from docling_eval.evaluators.layout_evaluator import MissingPredictionStrategy +from docling_eval.evaluators.pixel.confusion_matrix_exporter import ( + ConfusionMatrixExporter, +) from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( LayoutResolution, MultiLabelConfusionMatrix, - MultiLabelMatrixAggMetrics, MultiLabelMatrixEvaluation, ) @@ -35,17 +42,15 @@ class PagePixelLayoutEvaluation(BaseModel): doc_id: str page_no: int - detailed_metrics: MultiLabelMatrixAggMetrics - colapsed_metrics: Optional[MultiLabelMatrixAggMetrics] = None + matrix_evaluation: MultiLabelMatrixEvaluation class DatasetPixelLayoutEvaluation(BaseModel): num_pages: int num_pixels: int rejected_samples: Dict[EvaluationRejectionType, int] - detailed_metrics: MultiLabelMatrixAggMetrics - colapsed_metrics: Optional[MultiLabelMatrixAggMetrics] = None - page_evaluations: List[PagePixelLayoutEvaluation] + matrix_evaluation: MultiLabelMatrixEvaluation + page_evaluations: Dict[str, PagePixelLayoutEvaluation] def category_name_to_docitemlabel(category_name: str) -> DocItemLabel: @@ -175,7 +180,7 @@ def __call__( self, ds_path: Path, split: str = "test", - ) -> Tuple[DatasetPixelLayoutEvaluation, MultiLabelMatrixEvaluation]: + ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) # Load the dataset @@ -197,12 +202,14 @@ def __call__( matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) num_categories = len(matrix_categories_ids) ds_confusion_matrix = np.zeros((num_categories, num_categories)) - all_pages_evaluations: List[PagePixelLayoutEvaluation] = [] + all_pages_evaluations: Dict[str, PagePixelLayoutEvaluation] = ( + {} + ) # Key is doc_id-page-no ds_num_pixels = 0 for i, data in tqdm( enumerate(ds_selection), - desc="Layout evaluations", + desc="Multi-label Matrix Layout evaluations", ncols=120, total=len(ds_selection), ): @@ -223,25 +230,31 @@ def __call__( continue # Compute confusion matrices + pages_confusion_matrices: Dict[int, np.ndarray] pages_confusion_matrices, num_pixels = ( self._compute_document_confusion_matrix(true_doc, pred_doc) ) # Compute metrics per page for page_no, page_confusion_matrix in pages_confusion_matrices.items(): + # Contribute to the dataset's confusion matrix ds_confusion_matrix += page_confusion_matrix - page_metrics = self._mlcm.compute_metrics( - page_confusion_matrix, - self._matrix_id_to_name, - True, + + # Compute page metrics + page_matrix_evaluation: MultiLabelMatrixEvaluation = ( + 
self._mlcm.compute_metrics( + page_confusion_matrix, + self._matrix_id_to_name, + True, + ) ) page_evaluation = PagePixelLayoutEvaluation( doc_id=doc_id, page_no=page_no, - detailed_metrics=page_metrics.detailed_metrics, - colapsed_metrics=page_metrics.colapsed_metrics, + matrix_evaluation=page_matrix_evaluation, ) - all_pages_evaluations.append(page_evaluation) + doc_page_id = f"{doc_id}-{page_no}" + all_pages_evaluations[doc_page_id] = page_evaluation ds_num_pixels += num_pixels @@ -256,24 +269,79 @@ def __call__( num_pages=len(all_pages_evaluations), num_pixels=num_pixels, rejected_samples=rejected_samples, - detailed_metrics=ds_matrix_evaluation.detailed_metrics, - colapsed_metrics=ds_matrix_evaluation.colapsed_metrics, + matrix_evaluation=ds_matrix_evaluation, page_evaluations=all_pages_evaluations, ) - return ds_evaluation, ds_matrix_evaluation + return ds_evaluation def save_evaluations( self, + benchmark: BenchMarkNames, ds_evaluation: DatasetPixelLayoutEvaluation, - ds_matrix_evaluation: MultiLabelMatrixEvaluation, save_root: Path, - excel_reports: bool = True, + export_excel_reports: bool = True, ): r""" Save all evaluations as jsons and excel reports """ - pass + save_root.mkdir(parents=True, exist_ok=True) + + # Save the dataset evaluation as a json + json_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.json" + with open(json_fn, "w") as fd: + json.dump(ds_evaluation.model_dump(), fd, indent=2, sort_keys=True) + + # Export excel reports + if not export_excel_reports: + return + + excel_exporter = ConfusionMatrixExporter() + model_name = "" # TODO: Check if it is possible to find the layout model used in predictions + headers = list( + self._matrix_id_to_name.values() + ) # TODO: Duplicate values may appear due to label_mappings + ds_confusion_matrix = ( + ds_evaluation.matrix_evaluation.detailed_metrics.confusion_matrix + ) + colapsed_headers: list[str] = [ + f"{metric}: {cell}" + for metric in ["Precision(GT/Pred)", "Recall(GT/Pred)", "F1(GT/Pred)"] + for cell in [ + "BG/BG", + "BG/cls", + "cls/BG", + "cls/cls", + ] + ] + image_colapsed_aggs: Dict[str, np.ndarray] = {} + bg_cls_name = self._matrix_id_to_name[0] + for doc_page_id, page_evaluations in ds_evaluation.page_evaluations.items(): + pm = page_evaluations.matrix_evaluation.colapsed_metrics + if not pm: + continue + # [12,] + image_colapsed_vector = np.stack( + [ + pm.precision_matrix.flatten(), + pm.recall_matrix.flatten(), + pm.f1_matrix.flatten(), + ], + axis=0, + ).flatten() + image_colapsed_aggs[doc_page_id] = image_colapsed_vector + + excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.xlsx" + + # excel_exporter.build_ds_report( + # model_name, + # ds_evaluation.num_pages, + # ds_evaluation.num_pixels, + # headers, + # ds_confusion_matrix, + # colapsed_headers, + # excel_fn, + # ) def _compute_document_confusion_matrix( self, diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index c3ea19cc..57a0a797 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -5,9 +5,15 @@ from docling.datamodel.base_models import ConversionStatus from docling_core.types.doc.labels import DocItemLabel -from docling_eval.datamodels.types import PredictionFormats +from docling_eval.datamodels.types import BenchMarkNames, PredictionFormats from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator -from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator +from 
docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( + MultiLabelMatrixEvaluation, +) +from docling_eval.evaluators.pixel_layout_evaluator import ( + DatasetPixelLayoutEvaluation, + PixelLayoutEvaluator, +) @pytest.mark.dependency( @@ -18,10 +24,8 @@ def test_layout_evaluator(): r""" """ test_dataset_dir = Path("scratch/DPBench/eval_dataset_e2e") - # Default evaluator + # Initialize default evaluator eval1 = PixelLayoutEvaluator() - v1 = eval1(test_dataset_dir) - assert v1 is not None # Custom label mappings label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = { @@ -29,8 +33,15 @@ def test_layout_evaluator(): DocItemLabel.DOCUMENT_INDEX: None, } eval2 = PixelLayoutEvaluator(label_mapping=label_mapping) - # v2 = eval2(test_dataset_dir) - # assert v2 is not None + + # Save the evaluations + pixel_ds_evaluation: DatasetPixelLayoutEvaluation = eval1(test_dataset_dir) + pixel_save_root: Path = test_dataset_dir / "pixel_layout_evaluations" + eval1.save_evaluations( + BenchMarkNames.DPBENCH, + pixel_ds_evaluation, + pixel_save_root, + ) if __name__ == "__main__": From 54917ad72c81be3925ad7fa37212c4b5a55908e9 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 13 Nov 2025 20:21:11 +0100 Subject: [PATCH 06/15] chore: Move all pixel layout evaluation pydantic transfer objects in pixel_types.py Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 8 +-- .../pixel/multi_label_confusion_matrix.py | 53 +++------------- docling_eval/evaluators/pixel/pixel_types.py | 63 +++++++++++++++++++ .../evaluators/pixel_layout_evaluator.py | 21 ++----- tests/test_pixel_layout_evaluator.py | 8 +-- 5 files changed, 82 insertions(+), 71 deletions(-) create mode 100644 docling_eval/evaluators/pixel/pixel_types.py diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 922a0c25..7c429509 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -96,13 +96,11 @@ OCREvaluator, OCRVisualizer, ) -from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( - MultiLabelMatrixEvaluation, -) -from docling_eval.evaluators.pixel_layout_evaluator import ( +from docling_eval.evaluators.pixel.pixel_types import ( DatasetPixelLayoutEvaluation, - PixelLayoutEvaluator, + MultiLabelMatrixEvaluation, ) +from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator from docling_eval.evaluators.readingorder_evaluator import ( DatasetReadingOrderEvaluation, ReadingOrderEvaluator, diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index 0586dab5..bd407e46 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -1,54 +1,17 @@ import logging import math -from typing import Any, Optional +from typing import Optional import numpy as np -from pydantic import BaseModel, model_serializer -_log = logging.getLogger(__name__) - - -class LayoutResolution(BaseModel): - r"""Single bbox resolution""" - - category_id: int - - # bbox coords: (x1, y1, x2, y2) with the origin(0, 0) at the top, left corner, no normalization - bbox: list[float] - - -class MultiLabelMatrixAggMetrics(BaseModel): - classes_precision: dict[str, float] - classes_recall: dict[str, float] - classes_f1: dict[str, float] - - classes_precision_mean: float - classes_recall_mean: float - classes_f1_mean: float +from docling_eval.evaluators.pixel.pixel_types import ( + LayoutResolution, + 
MultiLabelMatrixAggMetrics, + MultiLabelMatrixEvaluation, + MultiLabelMatrixMetrics, +) - -class MultiLabelMatrixMetrics(BaseModel): - model_config = {"arbitrary_types_allowed": True} - - confusion_matrix: np.ndarray - precision_matrix: np.ndarray - recall_matrix: np.ndarray - f1_matrix: np.ndarray - - agg_metrics: MultiLabelMatrixAggMetrics - - @model_serializer(mode="wrap") - def serialize_model(self, serializer: Any) -> dict: - data = serializer(self) - for field_name, field_value in self.__dict__.items(): - if isinstance(field_value, np.ndarray): - data[field_name] = field_value.tolist() - return data - - -class MultiLabelMatrixEvaluation(BaseModel): - detailed_metrics: MultiLabelMatrixMetrics - colapsed_metrics: Optional[MultiLabelMatrixMetrics] = None +_log = logging.getLogger(__name__) def unpackbits(x: np.ndarray, num_bits: int): diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py new file mode 100644 index 00000000..eaca5b65 --- /dev/null +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -0,0 +1,63 @@ +from typing import Any, Dict, Optional + +import numpy as np +from pydantic import BaseModel, model_serializer + +from docling_eval.evaluators.base_evaluator import EvaluationRejectionType + + +class LayoutResolution(BaseModel): + r"""Single bbox resolution""" + + category_id: int + + # bbox coords: (x1, y1, x2, y2) with the origin(0, 0) at the top, left corner, no normalization + bbox: list[float] + + +class MultiLabelMatrixAggMetrics(BaseModel): + classes_precision: dict[str, float] + classes_recall: dict[str, float] + classes_f1: dict[str, float] + + classes_precision_mean: float + classes_recall_mean: float + classes_f1_mean: float + + +class MultiLabelMatrixMetrics(BaseModel): + model_config = {"arbitrary_types_allowed": True} + + confusion_matrix: np.ndarray + precision_matrix: np.ndarray + recall_matrix: np.ndarray + f1_matrix: np.ndarray + + agg_metrics: MultiLabelMatrixAggMetrics + + @model_serializer(mode="wrap") + def serialize_model(self, serializer: Any) -> dict: + data = serializer(self) + for field_name, field_value in self.__dict__.items(): + if isinstance(field_value, np.ndarray): + data[field_name] = field_value.tolist() + return data + + +class MultiLabelMatrixEvaluation(BaseModel): + detailed_metrics: MultiLabelMatrixMetrics + colapsed_metrics: Optional[MultiLabelMatrixMetrics] = None + + +class PagePixelLayoutEvaluation(BaseModel): + doc_id: str + page_no: int + matrix_evaluation: MultiLabelMatrixEvaluation + + +class DatasetPixelLayoutEvaluation(BaseModel): + num_pages: int + num_pixels: int + rejected_samples: Dict[EvaluationRejectionType, int] + matrix_evaluation: MultiLabelMatrixEvaluation + page_evaluations: Dict[str, PagePixelLayoutEvaluation] diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index c8d4380c..2a054139 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -12,7 +12,6 @@ from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument from docling_core.types.doc.labels import DocItemLabel from docling_ibm_models.layoutmodel.labels import LayoutLabels -from pydantic import BaseModel from tqdm import tqdm # type: ignore from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction @@ -31,28 +30,18 @@ ConfusionMatrixExporter, ) from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( - LayoutResolution, 
MultiLabelConfusionMatrix, +) +from docling_eval.evaluators.pixel.pixel_types import ( + DatasetPixelLayoutEvaluation, + LayoutResolution, MultiLabelMatrixEvaluation, + PagePixelLayoutEvaluation, ) _log = logging.getLogger(__name__) -class PagePixelLayoutEvaluation(BaseModel): - doc_id: str - page_no: int - matrix_evaluation: MultiLabelMatrixEvaluation - - -class DatasetPixelLayoutEvaluation(BaseModel): - num_pages: int - num_pixels: int - rejected_samples: Dict[EvaluationRejectionType, int] - matrix_evaluation: MultiLabelMatrixEvaluation - page_evaluations: Dict[str, PagePixelLayoutEvaluation] - - def category_name_to_docitemlabel(category_name: str) -> DocItemLabel: r""" """ label = DocItemLabel(category_name.lower().replace(" ", "_").replace("-", "_")) diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index 57a0a797..fb1aaab3 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -7,13 +7,11 @@ from docling_eval.datamodels.types import BenchMarkNames, PredictionFormats from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator -from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( - MultiLabelMatrixEvaluation, -) -from docling_eval.evaluators.pixel_layout_evaluator import ( +from docling_eval.evaluators.pixel.pixel_types import ( DatasetPixelLayoutEvaluation, - PixelLayoutEvaluator, + MultiLabelMatrixEvaluation, ) +from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator @pytest.mark.dependency( From 7630760a13d25f77c940eda1003bb91c9ed84ed7 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 13 Nov 2025 20:54:46 +0100 Subject: [PATCH 07/15] feat: PixelLayoutEvaluator seems to work on DPBench. Matrices are generated and excel export works. 
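As a self-contained illustration of the serialization pattern carried by MultiLabelMatrixMetrics (ndarray fields permitted through arbitrary_types_allowed and converted to plain lists by a wrap-mode model_serializer), here is a minimal stand-in. MatrixPayload is hypothetical; only the serializer pattern mirrors the evaluator's code.

    import json
    from typing import Any

    import numpy as np
    from pydantic import BaseModel, model_serializer


    class MatrixPayload(BaseModel):
        model_config = {"arbitrary_types_allowed": True}

        name: str
        matrix: np.ndarray

        @model_serializer(mode="wrap")
        def serialize_model(self, serializer: Any) -> dict:
            data = serializer(self)
            for field_name, field_value in self.__dict__.items():
                if isinstance(field_value, np.ndarray):
                    data[field_name] = field_value.tolist()
            return data


    payload = MatrixPayload(name="demo", matrix=np.eye(2))
    print(json.dumps(payload.model_dump(), indent=2))  # "matrix" comes out as nested lists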
Additional testing of the API and CLI is needed Signed-off-by: Nikos Livathinos --- .../pixel/confusion_matrix_exporter.py | 115 +++++++++++------- .../pixel/multi_label_confusion_matrix.py | 40 +++--- docling_eval/evaluators/pixel/pixel_types.py | 6 +- .../evaluators/pixel_layout_evaluator.py | 27 ++-- 4 files changed, 103 insertions(+), 85 deletions(-) diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index e5d482b2..3e025590 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -17,6 +17,7 @@ from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( MultiLabelConfusionMatrix, ) +from docling_eval.evaluators.pixel.pixel_types import MultiLabelMatrixEvaluation _log = logging.getLogger(__name__) @@ -39,6 +40,47 @@ def power_norm(x, x_min, x_max, p=0.3): return x**p +def discover_filename_prefix( + root: Path, filename_extension: str, scan_depth: int = 10 +) -> Optional[str]: + r""" + Discover the common prefix used in the filenames of the prediction visualisations + Check up to scan_depth files in the dir + + Return + ------- + empty string: There is no common prefix + None: Cannot reach consensus in any common prefix + """ + + def common_prefix(a: str, b: str, stop_char="_"): + r"""String common prefix with stop char""" + prefix: list[str] = [] + for c1, c2 in zip(a, b): + if c1 != c2: + break + prefix.append(c1) + if c1 == stop_char: + break + return "".join(prefix) + + prev_image_filename = None + prefix = "" + for i, image_fn in enumerate(root.glob(f"*.{filename_extension}")): + if i >= scan_depth: + break + image_filename = image_fn.name + if prev_image_filename: + # new_prefix will be the empty string if there is nothing in common + new_prefix = common_prefix(image_filename, prev_image_filename) + if prefix == "": + prefix = new_prefix + elif new_prefix != "" and prefix != new_prefix: + return None + prev_image_filename = image_filename + return prefix + + class ConfusionMatrixExporter: r""" """ @@ -69,14 +111,12 @@ def build_ds_report( num_images: int, num_pixels: int, headers: list[str], - confusion_matrix: np.ndarray, - ds_metrics: dict, + matrix_evaluation: MultiLabelMatrixEvaluation, colapsed_headers: list[str], image_colaped_aggs: dict[str, np.ndarray], excel_fn: Path, - visualisations_root: Optional[Path], + visualisations_root: Optional[Path] = None, ): - # TODO: The new design produces pydantic objects for metrics instead of dicts r""" Generate excel report for the full dataset """ @@ -102,8 +142,7 @@ def build_ds_report( writer, ConfusionMatrixExporter.DATASET_WORKSHEET_NAME, headers, - confusion_matrix, - ds_metrics, + matrix_evaluation, 4, ) @@ -126,8 +165,7 @@ def build_ds_report( def build_image_report( self, headers: list[str], - confusion_matrix: np.ndarray, - metrics: dict, + matrix_evaluation: MultiLabelMatrixEvaluation, excel_fn: Path, ): with pd.ExcelWriter(excel_fn, engine="openpyxl") as writer: @@ -135,8 +173,7 @@ def build_image_report( writer, ConfusionMatrixExporter.DATASET_WORKSHEET_NAME, headers, - confusion_matrix, - metrics, + matrix_evaluation, ) # Adjust column widths @@ -180,22 +217,22 @@ def _aggregate_colapsed_image_metrics( ws: Worksheet = wb[worksheet_name] # Set the prediction visualisations as hyperlinks in the image filenames - # if visualisations_root: - # viz_prefix = discover_filename_prefix(visualisations_root, "png") - # if viz_prefix: - # col = startcol + 1 - 
# for i, image_filename in enumerate(image_colapsed_aggs.keys()): - # row = i + startrow + 2 - # cell = ws.cell(row=row, column=col) - # viz_fn = visualisations_root / f"{viz_prefix}{image_filename}" - # if not viz_fn.is_file(): - # continue - # cell.hyperlink = str(viz_fn) - # cell.style = "Hyperlink" - # else: - # _log.error( - # "Cannot the visualisation prefix in: %s", str(visualisations_root) - # ) + if visualisations_root: + viz_prefix = discover_filename_prefix(visualisations_root, "png") + if viz_prefix: + col = startcol + 1 + for i, image_filename in enumerate(image_colapsed_aggs.keys()): + row = i + startrow + 2 + cell = ws.cell(row=row, column=col) + viz_fn = visualisations_root / f"{viz_prefix}{image_filename}" + if not viz_fn.is_file(): + continue + cell.hyperlink = str(viz_fn) + cell.style = "Hyperlink" + else: + _log.error( + "Cannot the visualisation prefix in: %s", str(visualisations_root) + ) # Set the subtitle subtitle_cell = ws.cell( @@ -237,9 +274,7 @@ def _build_base_report( writer: ExcelWriter, worksheet_name: str, headers: list[str], - confusion_matrix: np.ndarray, - metrics: dict, - # matrix_metrics: MultiLabelMatrixEvaluation, + matrix_evaluation: MultiLabelMatrixEvaluation, startrow: int = 0, hide_zero_rows: bool = True, hide_zero_cols: bool = True, @@ -260,7 +295,7 @@ def _build_base_report( writer, worksheet_name, "Confusion Matrix", - confusion_matrix, + matrix_evaluation.detailed.confusion_matrix, headers, decimal_digits=3, origin_cell=(startrow, 0), @@ -272,14 +307,11 @@ def _build_base_report( # Add the precision matrix with detailed classes detailed_precision_row = max_row + detailed_spacing colapsed_precision_row = max_row + colapsed_spacing - detailed_precision_matrix: np.ndarray = metrics[ - MultiLabelConfusionMatrix.DETAILED_METRICS_KEY - ]["precision_matrix"] max_row, max_col = self._export_matrix_to_excel( writer, worksheet_name, "Precision Matrix", - detailed_precision_matrix, + matrix_evaluation.detailed.precision_matrix, headers, decimal_digits=3, origin_cell=(detailed_precision_row, 0), @@ -292,14 +324,11 @@ def _build_base_report( colapsed_col = max_col + 1 # Add the precision matrix with colapsed classes - colapsed_precision_matrix: np.ndarray = metrics[ - MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY - ]["precision_matrix"] self._export_matrix_to_excel( writer, worksheet_name, "Colapsed Precision Matrix", - colapsed_precision_matrix, + matrix_evaluation.colapsed.precision_matrix, colapsed_headers, decimal_digits=3, origin_cell=(colapsed_precision_row, colapsed_col), @@ -309,14 +338,11 @@ def _build_base_report( ) # Add the recall matrix with detailed classes - detailed_recall_matrix: np.ndarray = metrics[ - MultiLabelConfusionMatrix.DETAILED_METRICS_KEY - ]["recall_matrix"] max_row, max_col = self._export_matrix_to_excel( writer, worksheet_name, "Recall matrix", - detailed_recall_matrix, + matrix_evaluation.detailed.recall_matrix, headers, decimal_digits=3, origin_cell=(detailed_recall_row, 0), @@ -326,14 +352,11 @@ def _build_base_report( ) # Add the recall matrix with colapsed classes - colapsed_recall_matrix: np.ndarray = metrics[ - MultiLabelConfusionMatrix.COLAPSED_METRICS_KEY - ]["recall_matrix"] self._export_matrix_to_excel( writer, worksheet_name, "Colapsed Recall Matrix", - colapsed_recall_matrix, + matrix_evaluation.colapsed.recall_matrix, colapsed_headers, decimal_digits=3, origin_cell=(colapsed_recall_row, colapsed_col), diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py 
b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index bd407e46..1d87a8a3 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -308,7 +308,6 @@ def compute_metrics( self, confusion_matrix: np.ndarray, class_names: dict[int, str], - colapse_non_bg: bool = False, ) -> MultiLabelMatrixEvaluation: r""" Parameters: @@ -323,25 +322,26 @@ def compute_metrics( """ # Compute metrics on the full confusion matrix detailed_metrics = self._compute_matrix_metrics(confusion_matrix, class_names) - evaluation = MultiLabelMatrixEvaluation(detailed_metrics=detailed_metrics) - - if colapse_non_bg: - # Colapse the classes except the background and compute metrics again - colapsed_confusion_matrix = np.asarray( - [ - [confusion_matrix[0, 0], np.sum(confusion_matrix[0, 1:])], - [np.sum(confusion_matrix[1:, 0]), np.sum(confusion_matrix[1:, 1:])], - ] - ) - colapsed_class_names = { - 0: class_names[0], - 1: MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, - } - colapsed_metrics = self._compute_matrix_metrics( - colapsed_confusion_matrix, - colapsed_class_names, - ) - evaluation.colapsed_metrics = colapsed_metrics + + # Colapse the classes except the background and compute metrics again + colapsed_confusion_matrix = np.asarray( + [ + [confusion_matrix[0, 0], np.sum(confusion_matrix[0, 1:])], + [np.sum(confusion_matrix[1:, 0]), np.sum(confusion_matrix[1:, 1:])], + ] + ) + colapsed_class_names = { + 0: class_names[0], + 1: MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, + } + colapsed_metrics = self._compute_matrix_metrics( + colapsed_confusion_matrix, + colapsed_class_names, + ) + + evaluation = MultiLabelMatrixEvaluation( + detailed=detailed_metrics, colapsed=colapsed_metrics + ) return evaluation diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index eaca5b65..2b782dea 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict import numpy as np from pydantic import BaseModel, model_serializer @@ -45,8 +45,8 @@ def serialize_model(self, serializer: Any) -> dict: class MultiLabelMatrixEvaluation(BaseModel): - detailed_metrics: MultiLabelMatrixMetrics - colapsed_metrics: Optional[MultiLabelMatrixMetrics] = None + detailed: MultiLabelMatrixMetrics + colapsed: MultiLabelMatrixMetrics class PagePixelLayoutEvaluation(BaseModel): diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 2a054139..05863282 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -234,7 +234,6 @@ def __call__( self._mlcm.compute_metrics( page_confusion_matrix, self._matrix_id_to_name, - True, ) ) page_evaluation = PagePixelLayoutEvaluation( @@ -251,7 +250,6 @@ def __call__( ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics( ds_confusion_matrix, self._matrix_id_to_name, - True, ) ds_evaluation = DatasetPixelLayoutEvaluation( @@ -290,9 +288,6 @@ def save_evaluations( headers = list( self._matrix_id_to_name.values() ) # TODO: Duplicate values may appear due to label_mappings - ds_confusion_matrix = ( - ds_evaluation.matrix_evaluation.detailed_metrics.confusion_matrix - ) colapsed_headers: list[str] = [ f"{metric}: {cell}" for metric in ["Precision(GT/Pred)", "Recall(GT/Pred)", 
"F1(GT/Pred)"] @@ -304,9 +299,8 @@ def save_evaluations( ] ] image_colapsed_aggs: Dict[str, np.ndarray] = {} - bg_cls_name = self._matrix_id_to_name[0] for doc_page_id, page_evaluations in ds_evaluation.page_evaluations.items(): - pm = page_evaluations.matrix_evaluation.colapsed_metrics + pm = page_evaluations.matrix_evaluation.colapsed if not pm: continue # [12,] @@ -322,15 +316,16 @@ def save_evaluations( excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.xlsx" - # excel_exporter.build_ds_report( - # model_name, - # ds_evaluation.num_pages, - # ds_evaluation.num_pixels, - # headers, - # ds_confusion_matrix, - # colapsed_headers, - # excel_fn, - # ) + excel_exporter.build_ds_report( + model_name, + ds_evaluation.num_pages, + ds_evaluation.num_pixels, + headers, + ds_evaluation.matrix_evaluation, + colapsed_headers, + image_colapsed_aggs, + excel_fn, + ) def _compute_document_confusion_matrix( self, From 249645ee017d57f963488231b3a82c91d4e4f9f1 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 13 Nov 2025 21:00:58 +0100 Subject: [PATCH 08/15] feat: Add class_names in MultiLabelMatrixMetrics Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/pixel/multi_label_confusion_matrix.py | 1 + docling_eval/evaluators/pixel/pixel_types.py | 1 + 2 files changed, 2 insertions(+) diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index 1d87a8a3..f2a29e06 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -407,6 +407,7 @@ def array_to_dict(a: np.ndarray) -> dict[str, float]: ) metrics = MultiLabelMatrixMetrics( + class_names=class_names, confusion_matrix=confusion_matrix, precision_matrix=precision_matrix, recall_matrix=recall_matrix, diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index 2b782dea..d98912fc 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -28,6 +28,7 @@ class MultiLabelMatrixAggMetrics(BaseModel): class MultiLabelMatrixMetrics(BaseModel): model_config = {"arbitrary_types_allowed": True} + class_names: Dict[int, str] confusion_matrix: np.ndarray precision_matrix: np.ndarray recall_matrix: np.ndarray From ed869fc09f001baff78909b88cbed525aa97c2f6 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 10:54:44 +0100 Subject: [PATCH 09/15] chore: Improve tests for PixelLayoutEvaluator Signed-off-by: Nikos Livathinos --- tests/test_pixel_layout_evaluator.py | 69 +++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index fb1aaab3..99d60f9d 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -1,6 +1,7 @@ from pathlib import Path from typing import Dict, Optional +import numpy as np import pytest from docling.datamodel.base_models import ConversionStatus from docling_core.types.doc.labels import DocItemLabel @@ -25,22 +26,66 @@ def test_layout_evaluator(): # Initialize default evaluator eval1 = PixelLayoutEvaluator() - # Custom label mappings - label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = { - DocItemLabel.CAPTION: DocItemLabel.TITLE, - DocItemLabel.DOCUMENT_INDEX: None, - } - eval2 = PixelLayoutEvaluator(label_mapping=label_mapping) + # Peform the evaluation + 
evaluation: DatasetPixelLayoutEvaluation = eval1(test_dataset_dir) + + # Generic assertions + assert evaluation is not None + for rejection_type, rejection_count in evaluation.rejected_samples.items(): + assert ( + rejection_count == 0 + ), f"Unexpected rejections of type: {rejection_type.value}" + + # Pixel evalution assertions + assert evaluation.num_pages == len(evaluation.page_evaluations) + detailed_class_names: dict[str, str] = ( + evaluation.matrix_evaluation.detailed.class_names + ) + num_classes = len(detailed_class_names) + confusion_matrix_list = evaluation.matrix_evaluation.detailed.confusion_matrix + detailed_confusion_matrix = np.asarray(confusion_matrix_list) + assert detailed_confusion_matrix.shape == ( + num_classes, + num_classes, + ), "Wrong detailed confusion matrix dims" + colapsed_confusion_matrix_list = ( + evaluation.matrix_evaluation.colapsed.confusion_matrix + ) + colapsed_confusion_matrix = np.asarray(colapsed_confusion_matrix_list) + assert colapsed_confusion_matrix.shape == ( + 2, + 2, + ), "Wrong colapsed confusion matrix dims" - # Save the evaluations - pixel_ds_evaluation: DatasetPixelLayoutEvaluation = eval1(test_dataset_dir) - pixel_save_root: Path = test_dataset_dir / "pixel_layout_evaluations" + # Save the evaluation + pixel_save_root = Path( + "scratch/DPBench/evaluations/layout/pixel_layout_evaluations" + ) eval1.save_evaluations( BenchMarkNames.DPBENCH, - pixel_ds_evaluation, + evaluation, pixel_save_root, ) + expected_json_fn = pixel_save_root / "evaluation_DPBench_pixel_layout.json" + expected_excel_fn = pixel_save_root / "evaluation_DPBench_pixel_layout.xlsx" + assert expected_json_fn.is_file(), "Missing evaluation json file" + assert expected_excel_fn.is_file(), "Missing evaluation excel file" + + # Initialize with custom label mappings + label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = { + DocItemLabel.CAPTION: DocItemLabel.TITLE, + DocItemLabel.DOCUMENT_INDEX: None, + } + eval2 = PixelLayoutEvaluator(label_mapping=label_mapping) + assert len(eval2._matrix_doclabelitem_to_id) + 1 == len(eval2._matrix_id_to_name) + assert ( + eval2._matrix_doclabelitem_to_id[DocItemLabel.CAPTION] + == eval2._matrix_doclabelitem_to_id[DocItemLabel.TITLE] + ), "Wrong label mapping in _matrix_doclabelitem_to_id" + assert ( + DocItemLabel.CAPTION.value not in eval2._matrix_id_to_name.values() + ), "Wrong label mapping in _matrix_id_to_name" -if __name__ == "__main__": - test_layout_evaluator() +# if __name__ == "__main__": +# test_layout_evaluator() From 9fc7e8c36554379050e88daf77fa9dceb61bb200 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 11:47:10 +0100 Subject: [PATCH 10/15] feat: Make the DoclingPredictionProvider dump the full pipeline options in the `predictor_info` field, not only the default ones. Signed-off-by: Nikos Livathinos --- docling_eval/prediction_providers/docling_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling_eval/prediction_providers/docling_provider.py b/docling_eval/prediction_providers/docling_provider.py index 28b236dd..5a4dc72d 100644 --- a/docling_eval/prediction_providers/docling_provider.py +++ b/docling_eval/prediction_providers/docling_provider.py @@ -136,7 +136,7 @@ def info(self) -> Dict: "pipeline_class": v.pipeline_cls.__name__, "pipeline_options": ( v.pipeline_options.model_dump( - mode="json", exclude_defaults=True + mode="json", exclude_defaults=False ) if v.pipeline_options is not None else None # Parquet might not like empty dicts! 
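The switch from exclude_defaults=True to exclude_defaults=False above is what exposes the complete pipeline configuration to downstream consumers such as the pixel layout evaluator. A minimal sketch of the difference, using an invented options model rather than docling's real pipeline options:

    from pydantic import BaseModel


    class ToyPipelineOptions(BaseModel):
        # Hypothetical fields for illustration only.
        do_ocr: bool = True
        images_scale: float = 1.0


    opts = ToyPipelineOptions(images_scale=2.0)

    # Fields still at their default value are dropped, so the record is partial.
    print(opts.model_dump(mode="json", exclude_defaults=True))   # {'images_scale': 2.0}

    # The complete configuration is recorded.
    print(opts.model_dump(mode="json", exclude_defaults=False))  # {'do_ocr': True, 'images_scale': 2.0}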
From 5aea3247768373cb887b31b20e00c2a6e2e8c296 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 11:51:57 +0100 Subject: [PATCH 11/15] feat: Extend PixelLayoutEvaluator to detect the name of the layout model and add it in the reports Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 5 +--- .../pixel/confusion_matrix_exporter.py | 24 ++++++++++------- docling_eval/evaluators/pixel/pixel_types.py | 3 ++- .../evaluators/pixel_layout_evaluator.py | 26 ++++++++++++++----- docling_eval/utils/utils.py | 14 ++++++++++ 5 files changed, 51 insertions(+), 21 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 7c429509..52852784 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -96,10 +96,7 @@ OCREvaluator, OCRVisualizer, ) -from docling_eval.evaluators.pixel.pixel_types import ( - DatasetPixelLayoutEvaluation, - MultiLabelMatrixEvaluation, -) +from docling_eval.evaluators.pixel.pixel_types import DatasetPixelLayoutEvaluation from docling_eval.evaluators.pixel_layout_evaluator import PixelLayoutEvaluator from docling_eval.evaluators.readingorder_evaluator import ( DatasetReadingOrderEvaluation, diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index 3e025590..c7b8d053 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -107,7 +107,6 @@ def __init__( def build_ds_report( self, - model_name: str, num_images: int, num_pixels: int, headers: list[str], @@ -115,6 +114,7 @@ def build_ds_report( colapsed_headers: list[str], image_colaped_aggs: dict[str, np.ndarray], excel_fn: Path, + model_name: Optional[str] = None, visualisations_root: Optional[Path] = None, ): r""" @@ -127,15 +127,19 @@ def build_ds_report( wb.create_sheet(ConfusionMatrixExporter.DATASET_WORKSHEET_NAME) wb.active = 0 ds_ws: Worksheet = wb.active # type: ignore - ds_ws.cell(row=1, column=1).value = model_name - ds_ws.cell(row=1, column=1).font = Font( - bold=True, size=ConfusionMatrixExporter.TITLE_FONT_SIZE - ) - ds_ws.cell(row=2, column=1).value = "#images" - ds_ws.cell(row=2, column=2).value = num_images - ds_ws.cell(row=3, column=1).value = "#pixels" - ds_ws.cell(row=3, column=2).value = num_pixels - ds_ws.cell(row=3, column=2).number_format = f"#,##0" + header_row = 1 + if model_name: + ds_ws.cell(row=header_row, column=1).value = model_name + ds_ws.cell(row=header_row, column=1).font = Font( + bold=True, size=ConfusionMatrixExporter.TITLE_FONT_SIZE + ) + header_row += 1 + ds_ws.cell(row=header_row, column=1).value = "#images" + ds_ws.cell(row=header_row, column=2).value = num_images + header_row += 1 + ds_ws.cell(row=header_row, column=1).value = "#pixels" + ds_ws.cell(row=header_row, column=2).value = num_pixels + ds_ws.cell(row=header_row, column=2).number_format = f"#,##0" # Build the basic report self._build_base_report( diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index d98912fc..d38b6f9f 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -1,4 +1,4 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional import numpy as np from pydantic import BaseModel, model_serializer @@ -57,6 +57,7 @@ class PagePixelLayoutEvaluation(BaseModel): class DatasetPixelLayoutEvaluation(BaseModel): + layout_model_name: Optional[str] num_pages: int num_pixels: int 
rejected_samples: Dict[EvaluationRejectionType, int] diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 05863282..621bf4bd 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -38,6 +38,7 @@ MultiLabelMatrixEvaluation, PagePixelLayoutEvaluation, ) +from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -195,6 +196,7 @@ def __call__( {} ) # Key is doc_id-page-no ds_num_pixels = 0 + self._layout_model_name = None for i, data in tqdm( enumerate(ds_selection), @@ -203,6 +205,21 @@ def __call__( total=len(ds_selection), ): data_record = DatasetRecordWithPrediction.model_validate(data) + + # Try to extract the layout model name + if not self._layout_model_name: + self._layout_model_name = dict_get( + data_record.predictor_info, + [ + "options", + "pdf", + "pipeline_options", + "layout_options", + "model_spec", + "name", + ], + ) + doc_id: str = data_record.doc_id if data_record.status not in self._accepted_status: _log.error( @@ -253,6 +270,7 @@ def __call__( ) ds_evaluation = DatasetPixelLayoutEvaluation( + layout_model_name=self._layout_model_name, num_pages=len(all_pages_evaluations), num_pixels=num_pixels, rejected_samples=rejected_samples, @@ -284,10 +302,7 @@ def save_evaluations( return excel_exporter = ConfusionMatrixExporter() - model_name = "" # TODO: Check if it is possible to find the layout model used in predictions - headers = list( - self._matrix_id_to_name.values() - ) # TODO: Duplicate values may appear due to label_mappings + headers = list(self._matrix_id_to_name.values()) colapsed_headers: list[str] = [ f"{metric}: {cell}" for metric in ["Precision(GT/Pred)", "Recall(GT/Pred)", "F1(GT/Pred)"] @@ -317,7 +332,6 @@ def save_evaluations( excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.xlsx" excel_exporter.build_ds_report( - model_name, ds_evaluation.num_pages, ds_evaluation.num_pixels, headers, @@ -325,6 +339,7 @@ def save_evaluations( colapsed_headers, image_colapsed_aggs, excel_fn, + self._layout_model_name, ) def _compute_document_confusion_matrix( @@ -476,7 +491,6 @@ def _get_pred_doc( r""" Get the predicted DoclingDocument """ - # TODO: Duplicated code from LayoutEvaluator pred_doc = None for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: diff --git a/docling_eval/utils/utils.py b/docling_eval/utils/utils.py index 26ff67c1..77b33310 100644 --- a/docling_eval/utils/utils.py +++ b/docling_eval/utils/utils.py @@ -776,3 +776,17 @@ def does_intersection_area_exceed_threshold( if first_bbox_area > 0 else False ) + + +def dict_get(data: dict, keys: list[str], default=None): + r""" + Traverse the given path of keys and return the value of dict + If the path is broken return the default value + """ + current = data + for key in keys: + if isinstance(current, dict) and key in current: + current = current[key] + else: + return default + return current From 94c2376be699e45882fc59aed28db5db33936a55 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 14:30:01 +0100 Subject: [PATCH 12/15] feat: Extend PixelLayoutEvaluator to include DatasetStatistics fields for the f1 scores of the pages with the full classes and the colapsed classes. Then use these fields in the visualisations to make histograms. 
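The dict_get helper added above walks a key path through nested dicts and returns a default as soon as the path breaks, which is how the evaluator probes predictor_info for the layout model name. A small usage sketch; the nested structure mirrors the key path queried in the evaluator, while the model name value is purely illustrative:

    from docling_eval.utils.utils import dict_get

    predictor_info = {
        "options": {
            "pdf": {
                "pipeline_options": {
                    "layout_options": {"model_spec": {"name": "example-layout-model"}}
                }
            }
        }
    }

    layout_model_name = dict_get(
        predictor_info,
        ["options", "pdf", "pipeline_options", "layout_options", "model_spec", "name"],
    )
    # -> "example-layout-model"

    # A broken path returns the default instead of raising.
    assert dict_get(predictor_info, ["options", "ocr", "engine"], default="n/a") == "n/a"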
Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 25 +++++++++- .../pixel/confusion_matrix_exporter.py | 1 - docling_eval/evaluators/pixel/pixel_types.py | 22 +++++++- .../evaluators/pixel_layout_evaluator.py | 50 +++++++++++++++++-- 4 files changed, 90 insertions(+), 8 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 52852784..94bd0c19 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -675,7 +675,7 @@ def evaluate( pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator( idir, split=split ) - pixel_save_root: Path = save_fn.parent / "pixel_layout_evaluations" + pixel_save_root: Path = save_fn.parent pixel_layout_evaluator.save_evaluations( benchmark, pixel_ds_evaluation, @@ -905,6 +905,29 @@ def visualize( _log.info(content) with open(log_filename, "a") as fd: fd.write(content) + + ####################################################################################### + # TODO: Process stats from the pixel_layout_evaluator + pixel_eval_fns = PixelLayoutEvaluator.evaluation_filenames(benchmark, odir) + pixel_json_fn = pixel_eval_fns["json"] + with open(pixel_json_fn, "r") as fd: + pixel_layout_evaluation = ( + DatasetPixelLayoutEvaluation.model_validate_json(fd.read()) + ) + log_and_save_stats( + odir, + benchmark, + modality, + "pixel_all_classes_f1", + pixel_layout_evaluation.f1_all_classes_stats, + ) + log_and_save_stats( + odir, + benchmark, + modality, + "pixel_colapsed_classes_f1", + pixel_layout_evaluation.f1_colapsed_classes_stats, + ) except Exception as e: _log.error(f"Error processing layout evaluation: {str(e)}") diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index c7b8d053..497e0f6f 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -13,7 +13,6 @@ from openpyxl.worksheet.worksheet import Worksheet from pandas import ExcelWriter -# from src.utils.utils import discover_filename_prefix from docling_eval.evaluators.pixel.multi_label_confusion_matrix import ( MultiLabelConfusionMatrix, ) diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index d38b6f9f..0ccc53d6 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -1,9 +1,10 @@ from typing import Any, Dict, Optional import numpy as np -from pydantic import BaseModel, model_serializer +from pydantic import BaseModel, model_serializer, model_validator from docling_eval.evaluators.base_evaluator import EvaluationRejectionType +from docling_eval.evaluators.stats import DatasetStatistics class LayoutResolution(BaseModel): @@ -44,6 +45,21 @@ def serialize_model(self, serializer: Any) -> dict: data[field_name] = field_value.tolist() return data + @model_validator(mode="before") + @classmethod + def deserialize_arrays(cls, data: Any) -> Any: + if isinstance(data, dict): + array_fields = [ + "confusion_matrix", + "precision_matrix", + "recall_matrix", + "f1_matrix", + ] + for field_name in array_fields: + if field_name in data: + data[field_name] = np.asarray(data[field_name]) + return data + class MultiLabelMatrixEvaluation(BaseModel): detailed: MultiLabelMatrixMetrics @@ -63,3 +79,7 @@ class DatasetPixelLayoutEvaluation(BaseModel): rejected_samples: Dict[EvaluationRejectionType, int] matrix_evaluation: MultiLabelMatrixEvaluation page_evaluations: Dict[str, 
PagePixelLayoutEvaluation] + + # Statistics across all images for f1 on all classes and on the colapsed classes + f1_all_classes_stats: DatasetStatistics + f1_colapsed_classes_stats: DatasetStatistics diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 621bf4bd..8f1a7b51 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -16,8 +16,8 @@ from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction from docling_eval.datamodels.types import ( - BenchMarkColumns, BenchMarkNames, + EvaluationModality, PredictionFormats, ) from docling_eval.evaluators.base_evaluator import ( @@ -38,6 +38,7 @@ MultiLabelMatrixEvaluation, PagePixelLayoutEvaluation, ) +from docling_eval.evaluators.stats import compute_stats from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -55,6 +56,9 @@ class PixelLayoutEvaluator(BaseEvaluator): (precision, recall, f1). """ + # TODO: Expose a low level API/CLI that allows to evaluate outside of DoclingDocuments with models + # that have totally different classes than GT, described as strings instead of DocItemLabel + def __init__( self, label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, @@ -138,7 +142,7 @@ def _build_matrix_categories( ) # The values are shifted (not including zero) label_id_offset = 1 - # TODO: If label_mappings are provided, we end up having more than one DocItemLabel with the same cat_id + # Notice: If label_mappings are provided, we end up having more than one DocItemLabel with the same cat_id for label, canonical_cat_id in label_to_id.items(): effective_label = label if label_mapping and label in label_mapping: @@ -153,7 +157,7 @@ def _build_matrix_categories( matrix_id_to_name: Dict[int, str] = {} # The keys start from 0 to include BG shifted_canonical: Dict[int, str] = layout_labels.shifted_canonical_categories() - # TODO: If label_mappings are provided we end up having more than 1 cat_id with the same name + # Notice: If label_mappings are provided we end up having more than 1 cat_id with the same name for shifted_cat_id, cat_name in shifted_canonical.items(): label = None if cat_name != shifted_canonical[0]: @@ -197,6 +201,12 @@ def __call__( ) # Key is doc_id-page-no ds_num_pixels = 0 self._layout_model_name = None + pages_detailed_f1: list[float] = ( + [] + ) # Gather f1 score/image when evaluated on all classes + pages_colapsed_f1: list[float] = ( + [] + ) # Gather f1 score/image when evaluated on colapsed classes for i, data in tqdm( enumerate(ds_selection), @@ -261,6 +271,14 @@ def __call__( doc_page_id = f"{doc_id}-{page_no}" all_pages_evaluations[doc_page_id] = page_evaluation + # Update f1 lists + pages_detailed_f1.append( + page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean + ) + pages_colapsed_f1.append( + page_matrix_evaluation.colapsed.agg_metrics.classes_f1_mean + ) + ds_num_pixels += num_pixels # Compute metrics for the dataset and each document @@ -276,10 +294,29 @@ def __call__( rejected_samples=rejected_samples, matrix_evaluation=ds_matrix_evaluation, page_evaluations=all_pages_evaluations, + f1_all_classes_stats=compute_stats(pages_detailed_f1), + f1_colapsed_classes_stats=compute_stats(pages_colapsed_f1), ) return ds_evaluation + @staticmethod + def evaluation_filenames( + benchmark: BenchMarkNames, save_root: Path + ) -> dict[str, Path]: + r""" + Generate the expected filenames for the produced evaluation files + 
""" + modality: str = EvaluationModality.LAYOUT.value + json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json" + excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx" + + eval_filenames: dict[str, Path] = { + "json": json_fn, + "excel": excel_fn, + } + return eval_filenames + def save_evaluations( self, benchmark: BenchMarkNames, @@ -292,8 +329,11 @@ def save_evaluations( """ save_root.mkdir(parents=True, exist_ok=True) + # Get the evaluation filenames + eval_fns = PixelLayoutEvaluator.evaluation_filenames(benchmark, save_root) + # Save the dataset evaluation as a json - json_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.json" + json_fn = eval_fns["json"] with open(json_fn, "w") as fd: json.dump(ds_evaluation.model_dump(), fd, indent=2, sort_keys=True) @@ -329,7 +369,7 @@ def save_evaluations( ).flatten() image_colapsed_aggs[doc_page_id] = image_colapsed_vector - excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_layout.xlsx" + excel_fn = eval_fns["excel"] excel_exporter.build_ds_report( ds_evaluation.num_pages, From ae0ad5dcfa5e030584679a16d5cc671ad40f87a9 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 16:04:12 +0100 Subject: [PATCH 13/15] fix: PixelLayoutEvaluator: Convert num_pixels to uint64 Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/pixel/confusion_matrix_exporter.py | 4 ++-- docling_eval/evaluators/pixel/pixel_types.py | 2 +- docling_eval/evaluators/pixel_layout_evaluator.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index 497e0f6f..798ffe6d 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -107,7 +107,7 @@ def __init__( def build_ds_report( self, num_images: int, - num_pixels: int, + num_pixels: np.uint64, headers: list[str], matrix_evaluation: MultiLabelMatrixEvaluation, colapsed_headers: list[str], @@ -137,7 +137,7 @@ def build_ds_report( ds_ws.cell(row=header_row, column=2).value = num_images header_row += 1 ds_ws.cell(row=header_row, column=1).value = "#pixels" - ds_ws.cell(row=header_row, column=2).value = num_pixels + ds_ws.cell(row=header_row, column=2).value = str(num_pixels) ds_ws.cell(row=header_row, column=2).number_format = f"#,##0" # Build the basic report diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index 0ccc53d6..50ff25d4 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -75,7 +75,7 @@ class PagePixelLayoutEvaluation(BaseModel): class DatasetPixelLayoutEvaluation(BaseModel): layout_model_name: Optional[str] num_pages: int - num_pixels: int + num_pixels: np.uint64 rejected_samples: Dict[EvaluationRejectionType, int] matrix_evaluation: MultiLabelMatrixEvaluation page_evaluations: Dict[str, PagePixelLayoutEvaluation] diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 8f1a7b51..71911c30 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -199,7 +199,7 @@ def __call__( all_pages_evaluations: Dict[str, PagePixelLayoutEvaluation] = ( {} ) # Key is doc_id-page-no - ds_num_pixels = 0 + ds_num_pixels: np.uint64 = np.uint64(0) self._layout_model_name = None pages_detailed_f1: 
list[float] = ( [] @@ -388,7 +388,7 @@ def _compute_document_confusion_matrix( pred_doc: DoclingDocument, ) -> Tuple[ Dict[int, np.ndarray], # page_no -> page confusion matrix - int, # num_pixels + np.uint64, # num_pixels ]: r""" Compute the confusion matrix for the given documents. @@ -408,7 +408,7 @@ def _compute_document_confusion_matrix( num_categories = len(matrix_categories_ids) off_diagonal_cells = num_categories * num_categories - num_categories page_confusion_matrices: Dict[int, np.ndarray] = {} - num_pixels = 0 + num_pixels: np.uint64 = np.uint64(0) for page_no in sorted(gt_pages): page_size = true_doc.pages[page_no].size From 6a4a6f340a64f357de74419ec7a2a3b7ec7ea3e1 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 16:14:10 +0100 Subject: [PATCH 14/15] chore: Fix typos. Code comments Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 7 +-- .../pixel/confusion_matrix_exporter.py | 60 ++++++++++--------- .../pixel/multi_label_confusion_matrix.py | 21 ++++--- docling_eval/evaluators/pixel/pixel_types.py | 8 +-- .../evaluators/pixel_layout_evaluator.py | 27 ++++----- 5 files changed, 60 insertions(+), 63 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 94bd0c19..ade289da 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -906,8 +906,7 @@ def visualize( with open(log_filename, "a") as fd: fd.write(content) - ####################################################################################### - # TODO: Process stats from the pixel_layout_evaluator + # Process stats from the pixel_layout_evaluator pixel_eval_fns = PixelLayoutEvaluator.evaluation_filenames(benchmark, odir) pixel_json_fn = pixel_eval_fns["json"] with open(pixel_json_fn, "r") as fd: @@ -925,8 +924,8 @@ def visualize( odir, benchmark, modality, - "pixel_colapsed_classes_f1", - pixel_layout_evaluation.f1_colapsed_classes_stats, + "pixel_collapsed_classes_f1", + pixel_layout_evaluation.f1_collapsed_classes_stats, ) except Exception as e: _log.error(f"Error processing layout evaluation: {str(e)}") diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index 798ffe6d..a6655ae0 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -110,8 +110,8 @@ def build_ds_report( num_pixels: np.uint64, headers: list[str], matrix_evaluation: MultiLabelMatrixEvaluation, - colapsed_headers: list[str], - image_colaped_aggs: dict[str, np.ndarray], + collapsed_headers: list[str], + image_collaped_aggs: dict[str, np.ndarray], excel_fn: Path, model_name: Optional[str] = None, visualisations_root: Optional[Path] = None, @@ -149,12 +149,12 @@ def build_ds_report( 4, ) - # Add the colapsed image metrics in a separate worksheet - self._aggregate_colapsed_image_metrics( + # Add the collapsed image metrics in a separate worksheet + self._aggregate_collapsed_image_metrics( writer, ConfusionMatrixExporter.IMAGES_WORKSHEET_NAME, - colapsed_headers, - image_colaped_aggs, + collapsed_headers, + image_collaped_aggs, visualisations_root=visualisations_root, ) @@ -185,25 +185,25 @@ def build_image_report( _log.info("Image report: %s", str(excel_fn)) - def _aggregate_colapsed_image_metrics( + def _aggregate_collapsed_image_metrics( self, writer: ExcelWriter, worksheet_name: str, headers: list[str], - image_colapsed_aggs: dict[str, np.ndarray], + image_collapsed_aggs: dict[str, np.ndarray], origin_cell: 
tuple[int, int] = (0, 0), decimal_digits: int = 3, visualisations_root: Optional[Path] = None, ): r""" - Aggregate all colapsed image metrics + Aggregate all collapsed image metrics """ startrow = origin_cell[0] + 1 startcol = origin_cell[1] # Build the dataframe - index = list(image_colapsed_aggs.keys()) - data = np.stack(list(image_colapsed_aggs.values()), axis=0) # [num_images, 12] + index = list(image_collapsed_aggs.keys()) + data = np.stack(list(image_collapsed_aggs.values()), axis=0) # [num_images, 12] data = np.round(data, decimals=3) df = pd.DataFrame(data, index=index, columns=headers) @@ -224,7 +224,7 @@ def _aggregate_colapsed_image_metrics( viz_prefix = discover_filename_prefix(visualisations_root, "png") if viz_prefix: col = startcol + 1 - for i, image_filename in enumerate(image_colapsed_aggs.keys()): + for i, image_filename in enumerate(image_collapsed_aggs.keys()): row = i + startrow + 2 cell = ws.cell(row=row, column=col) viz_fn = visualisations_root / f"{viz_prefix}{image_filename}" @@ -241,7 +241,7 @@ def _aggregate_colapsed_image_metrics( subtitle_cell = ws.cell( row=origin_cell[0] + 1, column=origin_cell[1] + 1 ) # start from 1 - subtitle_cell.value = "Image colapsed classes metrics" + subtitle_cell.value = "Image collapsed classes metrics" subtitle_cell.font = Font( bold=True, size=ConfusionMatrixExporter.SUBTITLE_FONT_SIZE ) @@ -286,11 +286,13 @@ def _build_base_report( Generate excel report for a single image """ detailed_spacing = 4 # spacing between the detailed matrices - colapsed_spacing = 2 # spacing between a detailed and the next colapsed matrix + collapsed_spacing = ( + 2 # spacing between a detailed and the next collapsed matrix + ) - colapsed_headers = [ + collapsed_headers = [ headers[0], - MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, + MultiLabelConfusionMatrix.ALL_COLLAPSED_CLASSES_NAME, ] # Add the confusion matrix @@ -309,7 +311,7 @@ def _build_base_report( # Add the precision matrix with detailed classes detailed_precision_row = max_row + detailed_spacing - colapsed_precision_row = max_row + colapsed_spacing + collapsed_precision_row = max_row + collapsed_spacing max_row, max_col = self._export_matrix_to_excel( writer, worksheet_name, @@ -323,18 +325,18 @@ def _build_base_report( hide_zero_cols=hide_zero_cols, ) detailed_recall_row = max_row + detailed_spacing - colapsed_recall_row = max_row + colapsed_spacing - colapsed_col = max_col + 1 + collapsed_recall_row = max_row + collapsed_spacing + collapsed_col = max_col + 1 - # Add the precision matrix with colapsed classes + # Add the precision matrix with collapsed classes self._export_matrix_to_excel( writer, worksheet_name, - "Colapsed Precision Matrix", - matrix_evaluation.colapsed.precision_matrix, - colapsed_headers, + "Collapsed Precision Matrix", + matrix_evaluation.collapsed.precision_matrix, + collapsed_headers, decimal_digits=3, - origin_cell=(colapsed_precision_row, colapsed_col), + origin_cell=(collapsed_precision_row, collapsed_col), normalization_func="linear", hide_zero_rows=hide_zero_rows, hide_zero_cols=hide_zero_cols, @@ -354,15 +356,15 @@ def _build_base_report( hide_zero_cols=hide_zero_cols, ) - # Add the recall matrix with colapsed classes + # Add the recall matrix with collapsed classes self._export_matrix_to_excel( writer, worksheet_name, - "Colapsed Recall Matrix", - matrix_evaluation.colapsed.recall_matrix, - colapsed_headers, + "Collapsed Recall Matrix", + matrix_evaluation.collapsed.recall_matrix, + collapsed_headers, decimal_digits=3, - 
origin_cell=(colapsed_recall_row, colapsed_col), + origin_cell=(collapsed_recall_row, collapsed_col), normalization_func="linear", hide_zero_rows=hide_zero_rows, hide_zero_cols=hide_zero_cols, diff --git a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py index f2a29e06..2e2c4c8a 100644 --- a/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py +++ b/docling_eval/evaluators/pixel/multi_label_confusion_matrix.py @@ -29,8 +29,8 @@ class MultiLabelConfusionMatrix: r""" """ DETAILED_METRICS_KEY = "detailed_classes" - COLAPSED_METRICS_KEY = "colapsed_classes" - ALL_COLAPSED_CLASSES_NAME = "all_classes" + COLLAPSED_METRICS_KEY = "collapsed_classes" + ALL_COLLAPSED_CLASSES_NAME = "all_classes" def __init__( self, @@ -314,7 +314,6 @@ def compute_metrics( ----------- confusion_matrix: np.ndarray[num_categories + 1, num_categories + 1] class_names: Mapping from class_id to class_names - colapse_non_bg: Colapse all classes except of the first one that is assumed to be the BG Returns -------- @@ -323,24 +322,24 @@ def compute_metrics( # Compute metrics on the full confusion matrix detailed_metrics = self._compute_matrix_metrics(confusion_matrix, class_names) - # Colapse the classes except the background and compute metrics again - colapsed_confusion_matrix = np.asarray( + # Collapse the classes except the background and compute metrics again + collapsed_confusion_matrix = np.asarray( [ [confusion_matrix[0, 0], np.sum(confusion_matrix[0, 1:])], [np.sum(confusion_matrix[1:, 0]), np.sum(confusion_matrix[1:, 1:])], ] ) - colapsed_class_names = { + collapsed_class_names = { 0: class_names[0], - 1: MultiLabelConfusionMatrix.ALL_COLAPSED_CLASSES_NAME, + 1: MultiLabelConfusionMatrix.ALL_COLLAPSED_CLASSES_NAME, } - colapsed_metrics = self._compute_matrix_metrics( - colapsed_confusion_matrix, - colapsed_class_names, + collapsed_metrics = self._compute_matrix_metrics( + collapsed_confusion_matrix, + collapsed_class_names, ) evaluation = MultiLabelMatrixEvaluation( - detailed=detailed_metrics, colapsed=colapsed_metrics + detailed=detailed_metrics, collapsed=collapsed_metrics ) return evaluation diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index 50ff25d4..e813ca04 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -62,8 +62,8 @@ def deserialize_arrays(cls, data: Any) -> Any: class MultiLabelMatrixEvaluation(BaseModel): - detailed: MultiLabelMatrixMetrics - colapsed: MultiLabelMatrixMetrics + detailed: MultiLabelMatrixMetrics # All classes included + collapsed: MultiLabelMatrixMetrics # Only the background the other classes summed up together class PagePixelLayoutEvaluation(BaseModel): @@ -80,6 +80,6 @@ class DatasetPixelLayoutEvaluation(BaseModel): matrix_evaluation: MultiLabelMatrixEvaluation page_evaluations: Dict[str, PagePixelLayoutEvaluation] - # Statistics across all images for f1 on all classes and on the colapsed classes + # Statistics across all images for f1 on all classes and on the collapsed classes f1_all_classes_stats: DatasetStatistics - f1_colapsed_classes_stats: DatasetStatistics + f1_collapsed_classes_stats: DatasetStatistics diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 71911c30..10e66433 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -56,9 +56,6 
@@ class PixelLayoutEvaluator(BaseEvaluator): (precision, recall, f1). """ - # TODO: Expose a low level API/CLI that allows to evaluate outside of DoclingDocuments with models - # that have totally different classes than GT, described as strings instead of DocItemLabel - def __init__( self, label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, @@ -204,9 +201,9 @@ def __call__( pages_detailed_f1: list[float] = ( [] ) # Gather f1 score/image when evaluated on all classes - pages_colapsed_f1: list[float] = ( + pages_collapsed_f1: list[float] = ( [] - ) # Gather f1 score/image when evaluated on colapsed classes + ) # Gather f1 score/image when evaluated on collapsed classes for i, data in tqdm( enumerate(ds_selection), @@ -275,8 +272,8 @@ def __call__( pages_detailed_f1.append( page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean ) - pages_colapsed_f1.append( - page_matrix_evaluation.colapsed.agg_metrics.classes_f1_mean + pages_collapsed_f1.append( + page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean ) ds_num_pixels += num_pixels @@ -295,7 +292,7 @@ def __call__( matrix_evaluation=ds_matrix_evaluation, page_evaluations=all_pages_evaluations, f1_all_classes_stats=compute_stats(pages_detailed_f1), - f1_colapsed_classes_stats=compute_stats(pages_colapsed_f1), + f1_collapsed_classes_stats=compute_stats(pages_collapsed_f1), ) return ds_evaluation @@ -343,7 +340,7 @@ def save_evaluations( excel_exporter = ConfusionMatrixExporter() headers = list(self._matrix_id_to_name.values()) - colapsed_headers: list[str] = [ + collapsed_headers: list[str] = [ f"{metric}: {cell}" for metric in ["Precision(GT/Pred)", "Recall(GT/Pred)", "F1(GT/Pred)"] for cell in [ @@ -353,13 +350,13 @@ def save_evaluations( "cls/cls", ] ] - image_colapsed_aggs: Dict[str, np.ndarray] = {} + image_collapsed_aggs: Dict[str, np.ndarray] = {} for doc_page_id, page_evaluations in ds_evaluation.page_evaluations.items(): - pm = page_evaluations.matrix_evaluation.colapsed + pm = page_evaluations.matrix_evaluation.collapsed if not pm: continue # [12,] - image_colapsed_vector = np.stack( + image_collapsed_vector = np.stack( [ pm.precision_matrix.flatten(), pm.recall_matrix.flatten(), @@ -367,7 +364,7 @@ def save_evaluations( ], axis=0, ).flatten() - image_colapsed_aggs[doc_page_id] = image_colapsed_vector + image_collapsed_aggs[doc_page_id] = image_collapsed_vector excel_fn = eval_fns["excel"] @@ -376,8 +373,8 @@ def save_evaluations( ds_evaluation.num_pixels, headers, ds_evaluation.matrix_evaluation, - colapsed_headers, - image_colapsed_aggs, + collapsed_headers, + image_collapsed_aggs, excel_fn, self._layout_model_name, ) From 889340660a8b047dc888f2d84512ef15d6129417 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 14 Nov 2025 16:52:41 +0100 Subject: [PATCH 15/15] fix: Use plain int python type for num_pixels Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/pixel/confusion_matrix_exporter.py | 2 +- docling_eval/evaluators/pixel/pixel_types.py | 2 +- docling_eval/evaluators/pixel_layout_evaluator.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py index a6655ae0..c62cbf08 100644 --- a/docling_eval/evaluators/pixel/confusion_matrix_exporter.py +++ b/docling_eval/evaluators/pixel/confusion_matrix_exporter.py @@ -107,7 +107,7 @@ def __init__( def build_ds_report( self, num_images: int, - num_pixels: np.uint64, + num_pixels: int, headers: 
list[str], matrix_evaluation: MultiLabelMatrixEvaluation, collapsed_headers: list[str], diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index e813ca04..a920d522 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -75,7 +75,7 @@ class PagePixelLayoutEvaluation(BaseModel): class DatasetPixelLayoutEvaluation(BaseModel): layout_model_name: Optional[str] num_pages: int - num_pixels: np.uint64 + num_pixels: int rejected_samples: Dict[EvaluationRejectionType, int] matrix_evaluation: MultiLabelMatrixEvaluation page_evaluations: Dict[str, PagePixelLayoutEvaluation] diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 10e66433..9bc95b05 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -196,7 +196,7 @@ def __call__( all_pages_evaluations: Dict[str, PagePixelLayoutEvaluation] = ( {} ) # Key is doc_id-page-no - ds_num_pixels: np.uint64 = np.uint64(0) + ds_num_pixels = 0 self._layout_model_name = None pages_detailed_f1: list[float] = ( [] @@ -385,7 +385,7 @@ def _compute_document_confusion_matrix( pred_doc: DoclingDocument, ) -> Tuple[ Dict[int, np.ndarray], # page_no -> page confusion matrix - np.uint64, # num_pixels + int, # num_pixels ]: r""" Compute the confusion matrix for the given documents. @@ -405,7 +405,7 @@ def _compute_document_confusion_matrix( num_categories = len(matrix_categories_ids) off_diagonal_cells = num_categories * num_categories - num_categories page_confusion_matrices: Dict[int, np.ndarray] = {} - num_pixels: np.uint64 = np.uint64(0) + num_pixels = 0 for page_no in sorted(gt_pages): page_size = true_doc.pages[page_no].size
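
Note on the collapsed metrics referenced in the patches above: the "collapsed" confusion matrix is the 2x2 background-vs-everything-else reduction computed in MultiLabelConfusionMatrix.compute_metrics, and the per-image/per-dataset precision, recall and F1 numbers are derived from confusion-matrix counts. The following is a minimal standalone sketch of that reduction and of a per-class precision/recall/F1 computation, for readers who want to reproduce the numbers outside the evaluator. It is illustrative only: the function names, the eps guard, and the orientation convention (rows = ground truth, columns = predictions) are assumptions made for this sketch, not the evaluator's API; the real code builds full precision/recall matrices rather than per-class vectors.

    import numpy as np

    def collapse_to_background_vs_rest(confusion_matrix: np.ndarray) -> np.ndarray:
        # Collapse an (N+1)x(N+1) pixel confusion matrix (row/col 0 = background)
        # into a 2x2 matrix: background vs. all non-background classes summed up,
        # mirroring the reduction shown in MultiLabelConfusionMatrix.compute_metrics.
        return np.asarray(
            [
                [confusion_matrix[0, 0], np.sum(confusion_matrix[0, 1:])],
                [np.sum(confusion_matrix[1:, 0]), np.sum(confusion_matrix[1:, 1:])],
            ],
            dtype=np.float64,
        )

    def per_class_precision_recall_f1(confusion_matrix: np.ndarray, eps: float = 1e-12):
        # Assumed convention for this sketch: rows = ground-truth classes,
        # columns = predicted classes; counts are pixels.
        tp = np.diag(confusion_matrix).astype(np.float64)
        precision = tp / np.maximum(confusion_matrix.sum(axis=0), eps)
        recall = tp / np.maximum(confusion_matrix.sum(axis=1), eps)
        f1 = 2.0 * precision * recall / np.maximum(precision + recall, eps)
        return precision, recall, f1

    if __name__ == "__main__":
        # Toy example: background + two layout classes, values are pixel counts.
        cm = np.asarray(
            [
                [900, 10, 5],
                [20, 300, 15],
                [8, 12, 250],
            ],
            dtype=np.float64,
        )
        collapsed = collapse_to_background_vs_rest(cm)
        p, r, f1 = per_class_precision_recall_f1(collapsed)
        print(collapsed)      # [[900, 15], [28, 577]]
        print(p, r, f1)       # background vs. all_classes metrics

Averaging the per-class F1 of each page (detailed and collapsed) and feeding the lists into compute_stats is, per the diffs above, how f1_all_classes_stats and f1_collapsed_classes_stats are populated on DatasetPixelLayoutEvaluation.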