Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [Unreleased]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do not touch the CHANGELOG. This will be done by the CI/CD automatically.


### Feature

* Add --stats CLI flag for detailed performance statistics ([`21504a3`](https://github.com/docling-project/docling/commit/21504a3))

## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22

### Feature
Expand Down
32 changes: 32 additions & 0 deletions CONTRIBUTION_EXAMPLE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Example Contribution
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this. We have similar steps outlined in the CONTRIBUTING file. If you would like to propose changes there, please do it in a standalone PR.


This file demonstrates a sample contribution to the Docling project.

## What this shows
- How to clone the repository
- How to set up the development environment with `uv`
- How to create a contribution branch
- How to make changes and prepare for pushing

## Development Setup Completed
- ✅ Repository cloned
- ✅ Virtual environment created with `uv sync`
- ✅ Dependencies installed (244 packages)
- ✅ CLI tool working (`docling --version`)
- ✅ Pre-commit hooks installed
- ✅ Contribution branch created

## Next Steps for Contributing
1. Fork the repository on GitHub
2. Add your fork as remote: `git remote add fork https://github.com/YOUR-USERNAME/docling.git`
3. Make your changes
4. Run tests: `uv run pytest`
5. Run pre-commit checks: `uv run pre-commit run --all-files`
6. Commit and push to your fork
7. Create a Pull Request

## Project Information
- **Language**: Python 3.9-3.13
- **Package Manager**: uv
- **Current Version**: 2.54.0
- **Main Purpose**: Document processing and parsing (PDF, DOCX, HTML, etc.)
118 changes: 116 additions & 2 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,103 @@ def show_external_plugins_callback(value: bool):
raise typer.Exit()


def display_performance_stats(
    conv_results: Iterable[ConversionResult], total_time: float
):
    """Display detailed performance statistics for the conversion process.

    Prints a summary table (document/page counts, success rate, throughput)
    and, when pipeline profiling data is present on the results, a second
    table breaking down time spent per pipeline operation.
    """
    docs = list(conv_results)
    if not docs:
        console.print("[yellow]No results to display statistics for.[/yellow]")
        return

    # Aggregate counts across all conversion results.
    doc_count = len(docs)
    ok_count = sum(r.status == ConversionStatus.SUCCESS for r in docs)
    page_count = sum(len(r.pages) for r in docs)

    # Derived metrics, guarding against division by zero.
    pages_per_doc = page_count / doc_count if doc_count > 0 else 0
    docs_per_sec = doc_count / total_time if total_time > 0 else 0
    pages_per_sec = page_count / total_time if total_time > 0 else 0

    # High-level summary table.
    summary = rich.table.Table(
        title="📊 Performance Statistics", show_header=True, header_style="bold magenta"
    )
    summary.add_column("Metric", style="cyan", justify="left")
    summary.add_column("Value", justify="right")

    for label, value in (
        ("📄 Total Documents", f"{doc_count:,}"),
        ("✅ Successful", f"{ok_count:,}"),
        ("❌ Failed", f"{doc_count - ok_count:,}"),
        ("📃 Total Pages", f"{page_count:,}"),
        ("📊 Avg Pages/Doc", f"{pages_per_doc:.1f}"),
        ("⏱️ Total Time", f"{total_time:.2f}s"),
        ("🚀 Throughput (docs/s)", f"{docs_per_sec:.2f}"),
        ("📄 Throughput (pages/s)", f"{pages_per_sec:.2f}"),
    ):
        summary.add_row(label, value)

    console.print()  # Add empty line
    console.print(summary)

    # Gather per-operation timing rows; populated only when profiling ran
    # and the timing items actually recorded samples.
    rows = [
        {
            "operation": key,
            "total_time": sum(item.times),
            "avg_time": item.avg(),
            "count": item.count,
            "min_time": min(item.times),
            "max_time": max(item.times),
        }
        for result in docs
        if result.timings
        for key, item in result.timings.items()
        if item.times
    ]

    if rows:
        timing_table = rich.table.Table(
            title="⚙️ Pipeline Timings", show_header=True, header_style="bold blue"
        )
        timing_table.add_column("Operation", style="yellow", justify="left")
        for header in ("Total (s)", "Avg (s)", "Min (s)", "Max (s)", "Count"):
            timing_table.add_column(header, justify="right")

        # Slowest operations first.
        for row in sorted(rows, key=lambda r: r["total_time"], reverse=True):
            timing_table.add_row(
                str(row["operation"]),
                f"{row['total_time']:.3f}",
                f"{row['avg_time']:.3f}",
                f"{row['min_time']:.3f}",
                f"{row['max_time']:.3f}",
                f"{row['count']:,}",
            )

        console.print()  # Add empty line
        console.print(timing_table)
    else:
        console.print()  # Add empty line
        console.print(
            "[yellow]💡 Tip: Enable pipeline profiling with DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true for detailed timing information.[/yellow]"
        )
        console.print(
            " Or it may already be enabled but no detailed timings were captured for this run."
        )

    console.print()  # Add empty line for spacing


def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
Expand Down Expand Up @@ -494,6 +591,13 @@ def convert( # noqa: C901
help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
),
] = settings.perf.page_batch_size,
show_stats: Annotated[
bool,
typer.Option(
"--stats",
help="Display detailed performance statistics after conversion.",
),
] = False,
):
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"

Expand All @@ -510,6 +614,10 @@ def convert( # noqa: C901
settings.debug.visualize_ocr = debug_visualize_ocr
settings.perf.page_batch_size = page_batch_size

# Enable profiling when stats are requested
if show_stats:
settings.debug.profile_pipeline_timings = True

if from_formats is None:
from_formats = list(InputFormat)

Expand Down Expand Up @@ -755,8 +863,10 @@ def convert( # noqa: C901
start_time = time.time()

_log.info(f"paths: {input_doc_paths}")
conv_results = doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
conv_results = list(
doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)
)

output.mkdir(parents=True, exist_ok=True)
Expand All @@ -775,6 +885,10 @@ def convert( # noqa: C901

end_time = time.time() - start_time

# Display performance statistics if requested
if show_stats:
display_performance_stats(conv_results, end_time)

_log.info(f"All documents were converted in {end_time:.2f} seconds.")


Expand Down
107 changes: 107 additions & 0 deletions docs/stats_feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Performance Statistics Feature
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We won't add a standalone page for this feature. Please rework it and add it as an addition to advanced_options.md.

The section could be about the performance statistics in general and a subsection (shorter than this) for how to use it in the CLI.


This document describes the new `--stats` performance statistics feature added to the Docling CLI.

## Overview

The `--stats` flag provides detailed performance metrics and timing information for document conversion operations. This feature is valuable for:

- Understanding processing bottlenecks
- Optimizing conversion workflows
- Benchmarking performance across different systems
- Debugging slow conversion processes

## Usage

Add the `--stats` flag to any `docling convert` command:

```bash
# Single document with stats
docling document.pdf --stats

# Multiple documents with stats
docling documents/ --stats --output ./converted

# With other options
docling document.pdf --stats --to json --to md --output ./output
```

## Output Format

The statistics output includes two main sections:

### 1. Performance Statistics Table

Shows high-level conversion metrics:

- **Total Documents**: Number of documents processed
- **Successful**: Number of successfully converted documents
- **Failed**: Number of failed conversions
- **Total Pages**: Sum of all pages across documents
- **Avg Pages/Doc**: Average pages per document
- **Total Time**: Total processing time in seconds
- **Throughput (docs/s)**: Documents processed per second
- **Throughput (pages/s)**: Pages processed per second

### 2. Pipeline Timings Table

Provides detailed breakdown of processing time by pipeline operation:

- **Operation**: Name of the pipeline stage (e.g., layout, table_structure, ocr)
- **Total (s)**: Total time spent in this operation across all documents
- **Avg (s)**: Average time per operation instance
- **Min (s)**: Minimum time observed
- **Max (s)**: Maximum time observed
- **Count**: Number of times this operation was executed

## Implementation Details

- Enabling `--stats` automatically enables pipeline profiling (`DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true`)
- Statistics are collected during processing and displayed after completion
- The feature works with single documents, multiple documents, and batch processing
- All timing measurements use high-precision monotonic time

## Example Output

```
📊 Performance Statistics
┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓
┃ Metric ┃ Value ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩
│ 📄 Total Documents │ 1 │
│ ✅ Successful │ 1 │
│ ❌ Failed │ 0 │
│ 📃 Total Pages │ 1 │
│ 📊 Avg Pages/Doc │ 1.0 │
│ ⏱️ Total Time │ 5.13s │
│ 🚀 Throughput (docs/s) │ 0.20 │
│ 📄 Throughput (pages/s) │ 0.20 │
└─────────────────────────┴───────┘
⚙️ Pipeline Timings
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┓
┃ Operation ┃ Total (s) ┃ Avg (s) ┃ Min (s) ┃ Max (s) ┃ Count ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━┩
│ pipeline_total │ 1.456 │ 1.456 │ 1.456 │ 1.456 │ 1 │
│ doc_build │ 1.410 │ 1.410 │ 1.410 │ 1.410 │ 1 │
│ table_structure │ 0.673 │ 0.673 │ 0.673 │ 0.673 │ 1 │
│ layout │ 0.508 │ 0.508 │ 0.508 │ 0.508 │ 1 │
│ ocr │ 0.115 │ 0.115 │ 0.115 │ 0.115 │ 1 │
│ page_parse │ 0.061 │ 0.061 │ 0.061 │ 0.061 │ 1 │
│ doc_assemble │ 0.046 │ 0.046 │ 0.046 │ 0.046 │ 1 │
│ page_init │ 0.045 │ 0.045 │ 0.045 │ 0.045 │ 1 │
│ reading_order │ 0.005 │ 0.005 │ 0.005 │ 0.005 │ 1 │
│ page_assemble │ 0.001 │ 0.001 │ 0.001 │ 0.001 │ 1 │
│ doc_enrich │ 0.000 │ 0.000 │ 0.000 │ 0.000 │ 1 │
└─────────────────┴───────────┴─────────┴─────────┴─────────┴───────┘
```

## Performance Insights

From the example above, you can see that:

- **Table structure detection** (0.673s) and **layout analysis** (0.508s) consume most processing time
- **OCR processing** takes 0.115s for this document
- **Document parsing** and **assembly** are relatively fast operations

This information helps identify optimization opportunities and understand where processing time is spent.
91 changes: 91 additions & 0 deletions examples/stats_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need an example for the CLI. Please remove.

"""
Example script demonstrating the new --stats performance feature in Docling CLI.
This script shows how the --stats flag provides detailed performance insights
for document conversion operations.
"""

import subprocess
import sys
from pathlib import Path


def _run_demo(title: str, success_msg: str, cmd: list) -> bool:
    """Run one docling CLI invocation and echo its captured output.

    Prints *title*, the command line, and the subprocess stdout/stderr.
    Returns True on success, False when the CLI exits non-zero.
    """
    print(title)
    print("-" * 40)
    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error: {e}")
        return False

    print(success_msg)
    print("\nOutput:")
    print(result.stdout)
    if result.stderr:
        # The CLI logs progress/warnings on stderr even on success.
        print("Warnings/Info:")
        print(result.stderr)
    return True


def run_docling_with_stats():
    """Demonstrate the --stats feature with example documents.

    Runs two docling CLI invocations (single document, then a batch) and
    prints their output; stops early if an invocation fails.
    """

    print("🚀 Docling CLI Performance Statistics Demo")
    print("=" * 50)
    print()

    # Example 1: Single document with stats
    if not _run_demo(
        "📄 Example 1: Single Document Performance Analysis",
        "✅ Conversion completed successfully!",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_single",
        ],
    ):
        return

    print("\n" + "=" * 50)
    print()

    # Example 2: Multiple documents with stats
    if not _run_demo(
        "📄 Example 2: Batch Processing Performance Analysis",
        "✅ Batch conversion completed successfully!",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "tests/data/pdf/code_and_formula.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_batch",
        ],
    ):
        return

    print("\n🎉 Demo completed! The --stats feature provides valuable insights into:")
    print(" • Overall conversion performance (throughput, timing)")
    print(" • Detailed pipeline operation breakdowns")
    print(" • Processing bottlenecks identification")
    print(" • Batch processing analytics")


if __name__ == "__main__":
    # The demo shells out to `docling` with repository-relative test PDFs,
    # so it only works when launched from the repository root.
    if Path("tests/data/pdf").exists():
        run_docling_with_stats()
    else:
        print("❌ Error: This script must be run from the Docling repository root directory")
        print(" Please run: cd /path/to/docling && python examples/stats_demo.py")
        sys.exit(1)
Loading
Loading