Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [Unreleased]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do not touch the CHANGELOG. This will be done by the CI/CD automatically.


### Feature

* Add --stats CLI flag for detailed performance statistics ([`21504a3`](https://github.com/docling-project/docling/commit/21504a3))

## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22

### Feature
Expand Down
32 changes: 32 additions & 0 deletions CONTRIBUTION_EXAMPLE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Example Contribution
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this. We have similar steps outlined in the CONTRIBUTING file. If you would like to propose changes there, please do it in a standalone PR.


This file demonstrates a sample contribution to the Docling project.

## What this shows
- How to clone the repository
- How to set up the development environment with `uv`
- How to create a contribution branch
- How to make changes and prepare for pushing

## Development Setup Completed
- ✅ Repository cloned
- ✅ Virtual environment created with `uv sync`
- ✅ Dependencies installed (244 packages)
- ✅ CLI tool working (`docling --version`)
- ✅ Pre-commit hooks installed
- ✅ Contribution branch created

## Next Steps for Contributing
1. Fork the repository on GitHub
2. Add your fork as remote: `git remote add fork https://github.com/YOUR-USERNAME/docling.git`
3. Make your changes
4. Run tests: `uv run pytest`
5. Run pre-commit checks: `uv run pre-commit run --all-files`
6. Commit and push to your fork
7. Create a Pull Request

## Project Information
- **Language**: Python 3.9-3.13
- **Package Manager**: uv
- **Current Version**: 2.54.0
- **Main Purpose**: Document processing and parsing (PDF, DOCX, HTML, etc.)
118 changes: 116 additions & 2 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,103 @@ def show_external_plugins_callback(value: bool):
raise typer.Exit()


def display_performance_stats(
    conv_results: Iterable[ConversionResult], total_time: float
):
    """Display detailed performance statistics for the conversion process.

    Prints a summary table (document/page counts, success rate, throughput)
    and, when pipeline profiling data is present on the results, a second
    table breaking down time spent per pipeline operation.
    """
    docs = list(conv_results)
    if not docs:
        console.print("[yellow]No results to display statistics for.[/yellow]")
        return

    # Aggregate counts across all conversion results.
    doc_count = len(docs)
    ok_count = sum(r.status == ConversionStatus.SUCCESS for r in docs)
    page_count = sum(len(r.pages) for r in docs)

    # Derived metrics, guarding against division by zero.
    pages_per_doc = page_count / doc_count if doc_count > 0 else 0
    docs_per_sec = doc_count / total_time if total_time > 0 else 0
    pages_per_sec = page_count / total_time if total_time > 0 else 0

    # High-level summary table.
    summary = rich.table.Table(
        title="📊 Performance Statistics", show_header=True, header_style="bold magenta"
    )
    summary.add_column("Metric", style="cyan", justify="left")
    summary.add_column("Value", justify="right")

    for label, value in (
        ("📄 Total Documents", f"{doc_count:,}"),
        ("✅ Successful", f"{ok_count:,}"),
        ("❌ Failed", f"{doc_count - ok_count:,}"),
        ("📃 Total Pages", f"{page_count:,}"),
        ("📊 Avg Pages/Doc", f"{pages_per_doc:.1f}"),
        ("⏱️ Total Time", f"{total_time:.2f}s"),
        ("🚀 Throughput (docs/s)", f"{docs_per_sec:.2f}"),
        ("📄 Throughput (pages/s)", f"{pages_per_sec:.2f}"),
    ):
        summary.add_row(label, value)

    console.print()  # Add empty line
    console.print(summary)

    # Gather per-operation timing rows; populated only when profiling ran
    # and the timing items actually recorded samples.
    rows = [
        {
            "operation": key,
            "total_time": sum(item.times),
            "avg_time": item.avg(),
            "count": item.count,
            "min_time": min(item.times),
            "max_time": max(item.times),
        }
        for result in docs
        if result.timings
        for key, item in result.timings.items()
        if item.times
    ]

    if rows:
        timing_table = rich.table.Table(
            title="⚙️ Pipeline Timings", show_header=True, header_style="bold blue"
        )
        timing_table.add_column("Operation", style="yellow", justify="left")
        for header in ("Total (s)", "Avg (s)", "Min (s)", "Max (s)", "Count"):
            timing_table.add_column(header, justify="right")

        # Slowest operations first.
        for row in sorted(rows, key=lambda r: r["total_time"], reverse=True):
            timing_table.add_row(
                str(row["operation"]),
                f"{row['total_time']:.3f}",
                f"{row['avg_time']:.3f}",
                f"{row['min_time']:.3f}",
                f"{row['max_time']:.3f}",
                f"{row['count']:,}",
            )

        console.print()  # Add empty line
        console.print(timing_table)
    else:
        console.print()  # Add empty line
        console.print(
            "[yellow]💡 Tip: Enable pipeline profiling with DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true for detailed timing information.[/yellow]"
        )
        console.print(
            " Or it may already be enabled but no detailed timings were captured for this run."
        )

    console.print()  # Add empty line for spacing


def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
Expand Down Expand Up @@ -494,6 +591,13 @@ def convert( # noqa: C901
help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
),
] = settings.perf.page_batch_size,
show_stats: Annotated[
bool,
typer.Option(
"--stats",
help="Display detailed performance statistics after conversion.",
),
] = False,
):
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"

Expand All @@ -510,6 +614,10 @@ def convert( # noqa: C901
settings.debug.visualize_ocr = debug_visualize_ocr
settings.perf.page_batch_size = page_batch_size

# Enable profiling when stats are requested
if show_stats:
settings.debug.profile_pipeline_timings = True

if from_formats is None:
from_formats = list(InputFormat)

Expand Down Expand Up @@ -755,8 +863,10 @@ def convert( # noqa: C901
start_time = time.time()

_log.info(f"paths: {input_doc_paths}")
conv_results = doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
conv_results = list(
doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)
)

output.mkdir(parents=True, exist_ok=True)
Expand All @@ -775,6 +885,10 @@ def convert( # noqa: C901

end_time = time.time() - start_time

# Display performance statistics if requested
if show_stats:
display_performance_stats(conv_results, end_time)

_log.info(f"All documents were converted in {end_time:.2f} seconds.")


Expand Down
107 changes: 107 additions & 0 deletions docs/stats_feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Performance Statistics Feature
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We won't add a standalone page for this feature. Please rework it and add it as an addition to advanced_options.md.

The section could be about the performance statistics in general and a subsection (shorter than this) for how to use it in the CLI.


This document describes the new `--stats` performance statistics feature added to the Docling CLI.

## Overview

The `--stats` flag provides detailed performance metrics and timing information for document conversion operations. This feature is valuable for:

- Understanding processing bottlenecks
- Optimizing conversion workflows
- Benchmarking performance across different systems
- Debugging slow conversion processes

## Usage

Add the `--stats` flag to any `docling convert` command:

```bash
# Single document with stats
docling document.pdf --stats

# Multiple documents with stats
docling documents/ --stats --output ./converted

# With other options
docling document.pdf --stats --to json --to md --output ./output
```

## Output Format

The statistics output includes two main sections:

### 1. Performance Statistics Table

Shows high-level conversion metrics:

- **Total Documents**: Number of documents processed
- **Successful**: Number of successfully converted documents
- **Failed**: Number of failed conversions
- **Total Pages**: Sum of all pages across documents
- **Avg Pages/Doc**: Average pages per document
- **Total Time**: Total processing time in seconds
- **Throughput (docs/s)**: Documents processed per second
- **Throughput (pages/s)**: Pages processed per second

### 2. Pipeline Timings Table

Provides detailed breakdown of processing time by pipeline operation:

- **Operation**: Name of the pipeline stage (e.g., layout, table_structure, ocr)
- **Total (s)**: Total time spent in this operation across all documents
- **Avg (s)**: Average time per operation instance
- **Min (s)**: Minimum time observed
- **Max (s)**: Maximum time observed
- **Count**: Number of times this operation was executed

## Implementation Details

- Enabling `--stats` automatically enables pipeline profiling (`DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true`)
- Statistics are collected during processing and displayed after completion
- The feature works with single documents, multiple documents, and batch processing
- All timing measurements use high-precision monotonic time

## Example Output

```
📊 Performance Statistics
┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ Metric ┃ Value ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ 📄 Total Documents │ 1 │
│ ✅ Successful │ 1 │
│ ❌ Failed │ 0 │
│ 📃 Total Pages │ 1 │
│ 📊 Avg Pages/Doc │ 1.0 │
│ ⏱️ Total Time │ 5.13s │
│ 🚀 Throughput (docs/s) │ 0.20 │
│ 📄 Throughput (pages/s) │ 0.20 │
└─────────────────────────┴───────┘
⚙️ Pipeline Timings
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┓
┃ Operation ┃ Total (s) ┃ Avg (s) ┃ Min (s) ┃ Max (s) ┃ Count ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━┩
│ pipeline_total │ 1.456 │ 1.456 │ 1.456 │ 1.456 │ 1 │
│ doc_build │ 1.410 │ 1.410 │ 1.410 │ 1.410 │ 1 │
│ table_structure │ 0.673 │ 0.673 │ 0.673 │ 0.673 │ 1 │
│ layout │ 0.508 │ 0.508 │ 0.508 │ 0.508 │ 1 │
│ ocr │ 0.115 │ 0.115 │ 0.115 │ 0.115 │ 1 │
│ page_parse │ 0.061 │ 0.061 │ 0.061 │ 0.061 │ 1 │
│ doc_assemble │ 0.046 │ 0.046 │ 0.046 │ 0.046 │ 1 │
│ page_init │ 0.045 │ 0.045 │ 0.045 │ 0.045 │ 1 │
│ reading_order │ 0.005 │ 0.005 │ 0.005 │ 0.005 │ 1 │
│ page_assemble │ 0.001 │ 0.001 │ 0.001 │ 0.001 │ 1 │
│ doc_enrich │ 0.000 │ 0.000 │ 0.000 │ 0.000 │ 1 │
└─────────────────┴───────────┴─────────┴─────────┴─────────┴───────┘
```

## Performance Insights

From the example above, you can see that:

- **Table structure detection** (0.673s) and **layout analysis** (0.508s) consume most processing time
- **OCR processing** takes 0.115s for this document
- **Document parsing** and **assembly** are relatively fast operations

This information helps identify optimization opportunities and understand where processing time is spent.
91 changes: 91 additions & 0 deletions examples/stats_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need an example for the CLI. Please remove.

"""
Example script demonstrating the new --stats performance feature in Docling CLI.
This script shows how the --stats flag provides detailed performance insights
for document conversion operations.
"""

import subprocess
import sys
from pathlib import Path


def _run_demo(title: str, success_msg: str, cmd: list) -> bool:
    """Run one docling CLI invocation and echo its captured output.

    Prints *title*, the command line, and the subprocess stdout/stderr.
    Returns True on success, False when the CLI exits non-zero.
    """
    print(title)
    print("-" * 40)
    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error: {e}")
        return False

    print(success_msg)
    print("\nOutput:")
    print(result.stdout)
    if result.stderr:
        # The CLI logs progress/warnings on stderr even on success.
        print("Warnings/Info:")
        print(result.stderr)
    return True


def run_docling_with_stats():
    """Demonstrate the --stats feature with example documents.

    Runs two docling CLI invocations (single document, then a batch) and
    prints their output; stops early if an invocation fails.
    """

    print("🚀 Docling CLI Performance Statistics Demo")
    print("=" * 50)
    print()

    # Example 1: Single document with stats
    if not _run_demo(
        "📄 Example 1: Single Document Performance Analysis",
        "✅ Conversion completed successfully!",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_single",
        ],
    ):
        return

    print("\n" + "=" * 50)
    print()

    # Example 2: Multiple documents with stats
    if not _run_demo(
        "📄 Example 2: Batch Processing Performance Analysis",
        "✅ Batch conversion completed successfully!",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "tests/data/pdf/code_and_formula.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_batch",
        ],
    ):
        return

    print("\n🎉 Demo completed! The --stats feature provides valuable insights into:")
    print(" • Overall conversion performance (throughput, timing)")
    print(" • Detailed pipeline operation breakdowns")
    print(" • Processing bottlenecks identification")
    print(" • Batch processing analytics")


if __name__ == "__main__":
    # The demo shells out to `docling` with repository-relative test PDFs,
    # so it only works when launched from the repository root.
    if Path("tests/data/pdf").exists():
        run_docling_with_stats()
    else:
        print("❌ Error: This script must be run from the Docling repository root directory")
        print(" Please run: cd /path/to/docling && python examples/stats_demo.py")
        sys.exit(1)
Loading
Loading