diff --git a/CHANGELOG.md b/CHANGELOG.md index 65abe2b2c..dd9460635 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## [Unreleased] + +### Feature + +* Add --stats CLI flag for detailed performance statistics ([`21504a3`](https://github.com/docling-project/docling/commit/21504a3)) + ## [v2.54.0](https://github.com/docling-project/docling/releases/tag/v2.54.0) - 2025-09-22 ### Feature diff --git a/CONTRIBUTION_EXAMPLE.md b/CONTRIBUTION_EXAMPLE.md new file mode 100644 index 000000000..74eb3ae32 --- /dev/null +++ b/CONTRIBUTION_EXAMPLE.md @@ -0,0 +1,32 @@ +# Example Contribution + +This file demonstrates a sample contribution to the Docling project. + +## What this shows +- How to clone the repository +- How to set up the development environment with `uv` +- How to create a contribution branch +- How to make changes and prepare for pushing + +## Development Setup Completed +- ✅ Repository cloned +- ✅ Virtual environment created with `uv sync` +- ✅ Dependencies installed (244 packages) +- ✅ CLI tool working (`docling --version`) +- ✅ Pre-commit hooks installed +- ✅ Contribution branch created + +## Next Steps for Contributing +1. Fork the repository on GitHub +2. Add your fork as remote: `git remote add fork https://github.com/YOUR-USERNAME/docling.git` +3. Make your changes +4. Run tests: `uv run pytest` +5. Run pre-commit checks: `uv run pre-commit run --all-files` +6. Commit and push to your fork +7. Create a Pull Request + +## Project Information +- **Language**: Python 3.9-3.13 +- **Package Manager**: uv +- **Current Version**: 2.54.0 +- **Main Purpose**: Document processing and parsing (PDF, DOCX, HTML, etc.) 
def _aggregate_pipeline_timings(
    results: Iterable[ConversionResult],
) -> list[tuple[str, float, float, float, float, int]]:
    """Merge per-document pipeline timings into one row per operation.

    Args:
        results: Conversion results whose ``timings`` mapping (operation name
            -> profiling item exposing ``times`` and ``count``) is read.

    Returns:
        Rows of ``(operation, total, avg, min, max, count)`` sorted by total
        time, slowest first. Operations without recorded samples are omitted.
    """
    samples: dict[str, list[float]] = {}
    counts: dict[str, int] = {}
    for result in results:
        if not result.timings:
            continue
        for timing_key, timing_item in result.timings.items():
            if not timing_item.times:  # Only include timings with actual data
                continue
            key = str(timing_key)
            # Pool the samples of every document under the same operation so a
            # batch run yields a single row per operation, not one per document.
            samples.setdefault(key, []).extend(timing_item.times)
            counts[key] = counts.get(key, 0) + timing_item.count

    rows = []
    for key, times in samples.items():
        total = sum(times)
        rows.append((key, total, total / len(times), min(times), max(times), counts[key]))

    # Sort by total time (descending) so bottlenecks are listed first.
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows


def display_performance_stats(
    conv_results: Iterable[ConversionResult], total_time: float
) -> None:
    """Display detailed performance statistics for the conversion process.

    Prints a summary table (document/page counts, success/failure counts,
    elapsed time and throughput) followed by a pipeline timings table when any
    result carries timing samples; otherwise prints a hint on how to enable
    pipeline profiling.

    Args:
        conv_results: Conversion results to summarize. The iterable is
            materialized once, so a generator may be passed.
        total_time: Elapsed time of the whole run in seconds. A value of zero
            or less yields a throughput of 0 instead of dividing by zero.
    """
    results = list(conv_results)
    if not results:
        console.print("[yellow]No results to display statistics for.[/yellow]")
        return

    # Overall statistics
    total_docs = len(results)
    success_count = sum(1 for r in results if r.status == ConversionStatus.SUCCESS)
    failure_count = total_docs - success_count

    # Document statistics (total_docs is >= 1 here because of the early return)
    total_pages = sum(len(r.pages) for r in results)
    avg_pages_per_doc = total_pages / total_docs

    # Performance metrics
    throughput_docs = total_docs / total_time if total_time > 0 else 0
    throughput_pages = total_pages / total_time if total_time > 0 else 0

    # Create main statistics table
    stats_table = rich.table.Table(
        title="📊 Performance Statistics", show_header=True, header_style="bold magenta"
    )
    stats_table.add_column("Metric", style="cyan", justify="left")
    stats_table.add_column("Value", justify="right")

    stats_table.add_row("📄 Total Documents", f"{total_docs:,}")
    stats_table.add_row("✅ Successful", f"{success_count:,}")
    stats_table.add_row("❌ Failed", f"{failure_count:,}")
    stats_table.add_row("📃 Total Pages", f"{total_pages:,}")
    stats_table.add_row("📊 Avg Pages/Doc", f"{avg_pages_per_doc:.1f}")
    stats_table.add_row("⏱️ Total Time", f"{total_time:.2f}s")
    stats_table.add_row("🚀 Throughput (docs/s)", f"{throughput_docs:.2f}")
    stats_table.add_row("📄 Throughput (pages/s)", f"{throughput_pages:.2f}")

    console.print()  # Add empty line
    console.print(stats_table)

    # Pipeline timings (only present if profiling was enabled)
    timing_rows = _aggregate_pipeline_timings(results)

    if timing_rows:
        # Create pipeline timings table
        timing_table = rich.table.Table(
            title="⚙️ Pipeline Timings", show_header=True, header_style="bold blue"
        )
        timing_table.add_column("Operation", style="yellow", justify="left")
        timing_table.add_column("Total (s)", justify="right")
        timing_table.add_column("Avg (s)", justify="right")
        timing_table.add_column("Min (s)", justify="right")
        timing_table.add_column("Max (s)", justify="right")
        timing_table.add_column("Count", justify="right")

        for operation, total, avg, min_time, max_time, count in timing_rows:
            timing_table.add_row(
                operation,
                f"{total:.3f}",
                f"{avg:.3f}",
                f"{min_time:.3f}",
                f"{max_time:.3f}",
                f"{count:,}",
            )

        console.print()  # Add empty line
        console.print(timing_table)
    else:
        console.print()  # Add empty line
        console.print(
            "[yellow]💡 Tip: Enable pipeline profiling with DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true for detailed timing information.[/yellow]"
        )
        console.print(
            " Or it may already be enabled but no detailed timings were captured for this run."
        )

    console.print()  # Add empty line for spacing
Default: {settings.perf.page_batch_size}", ), ] = settings.perf.page_batch_size, + show_stats: Annotated[ + bool, + typer.Option( + "--stats", + help="Display detailed performance statistics after conversion.", + ), + ] = False, ): log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s" @@ -510,6 +614,10 @@ def convert( # noqa: C901 settings.debug.visualize_ocr = debug_visualize_ocr settings.perf.page_batch_size = page_batch_size + # Enable profiling when stats are requested + if show_stats: + settings.debug.profile_pipeline_timings = True + if from_formats is None: from_formats = list(InputFormat) @@ -755,8 +863,10 @@ def convert( # noqa: C901 start_time = time.time() _log.info(f"paths: {input_doc_paths}") - conv_results = doc_converter.convert_all( - input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error + conv_results = list( + doc_converter.convert_all( + input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error + ) ) output.mkdir(parents=True, exist_ok=True) @@ -775,6 +885,10 @@ def convert( # noqa: C901 end_time = time.time() - start_time + # Display performance statistics if requested + if show_stats: + display_performance_stats(conv_results, end_time) + _log.info(f"All documents were converted in {end_time:.2f} seconds.") diff --git a/docs/stats_feature.md b/docs/stats_feature.md new file mode 100644 index 000000000..ff5174eb9 --- /dev/null +++ b/docs/stats_feature.md @@ -0,0 +1,107 @@ +# Performance Statistics Feature + +This document describes the new `--stats` performance statistics feature added to the Docling CLI. + +## Overview + +The `--stats` flag provides detailed performance metrics and timing information for document conversion operations. 
This feature is valuable for: + +- Understanding processing bottlenecks +- Optimizing conversion workflows +- Benchmarking performance across different systems +- Debugging slow conversion processes + +## Usage + +Add the `--stats` flag to any `docling convert` command: + +```bash +# Single document with stats +docling document.pdf --stats + +# Multiple documents with stats +docling documents/ --stats --output ./converted + +# With other options +docling document.pdf --stats --to json --to md --output ./output +``` + +## Output Format + +The statistics output includes two main sections: + +### 1. Performance Statistics Table + +Shows high-level conversion metrics: + +- **Total Documents**: Number of documents processed +- **Successful**: Number of successfully converted documents +- **Failed**: Number of failed conversions +- **Total Pages**: Sum of all pages across documents +- **Avg Pages/Doc**: Average pages per document +- **Total Time**: Total processing time in seconds +- **Throughput (docs/s)**: Documents processed per second +- **Throughput (pages/s)**: Pages processed per second + +### 2. 
Pipeline Timings Table + +Provides a detailed breakdown of processing time by pipeline operation: + +- **Operation**: Name of the pipeline stage (e.g., layout, table_structure, ocr) +- **Total (s)**: Total time spent in this operation across all documents +- **Avg (s)**: Average time per operation instance +- **Min (s)**: Minimum time observed +- **Max (s)**: Maximum time observed +- **Count**: Number of times this operation was executed + +## Implementation Details + +- Enabling `--stats` automatically enables pipeline profiling (`DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS=true`) +- Statistics are collected during processing and displayed after completion +- The feature works with single documents, multiple documents, and batch processing +- **Total Time** is measured with wall-clock `time.time()` around the conversion run; per-operation timings are reported by the pipeline profiler + +## Example Output + +``` + 📊 Performance Statistics +┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ +┃ Metric ┃ Value ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ +│ 📄 Total Documents │ 1 │ +│ ✅ Successful │ 1 │ +│ ❌ Failed │ 0 │ +│ 📃 Total Pages │ 1 │ +│ 📊 Avg Pages/Doc │ 1.0 │ +│ ⏱️ Total Time │ 5.13s │ +│ 🚀 Throughput (docs/s) │ 0.20 │ +│ 📄 Throughput (pages/s) │ 0.20 │ +└─────────────────────────┴───────┘ + + ⚙️ Pipeline Timings +┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━┓ +┃ Operation ┃ Total (s) ┃ Avg (s) ┃ Min (s) ┃ Max (s) ┃ Count ┃ +┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━┩ +│ pipeline_total │ 1.456 │ 1.456 │ 1.456 │ 1.456 │ 1 │ +│ doc_build │ 1.410 │ 1.410 │ 1.410 │ 1.410 │ 1 │ +│ table_structure │ 0.673 │ 0.673 │ 0.673 │ 0.673 │ 1 │ +│ layout │ 0.508 │ 0.508 │ 0.508 │ 0.508 │ 1 │ +│ ocr │ 0.115 │ 0.115 │ 0.115 │ 0.115 │ 1 │ +│ page_parse │ 0.061 │ 0.061 │ 0.061 │ 0.061 │ 1 │ +│ doc_assemble │ 0.046 │ 0.046 │ 0.046 │ 0.046 │ 1 │ +│ page_init │ 0.045 │ 0.045 │ 0.045 │ 0.045 │ 1 │ +│ reading_order │ 0.005 │ 0.005 │ 0.005 │ 0.005 │ 1 │ +│ page_assemble │ 0.001 │ 0.001 │ 0.001 │ 0.001 │ 1 │ +│ doc_enrich │ 0.000 │ 0.000 │ 0.000 │ 
def _run_example(title: str, cmd: list[str], success_message: str) -> bool:
    """Run one docling CLI command and echo its captured output.

    Args:
        title: Heading printed before the command is run.
        cmd: Argument vector passed to ``subprocess.run`` (no shell involved).
        success_message: Line printed when the command exits with status 0.

    Returns:
        True if the command ran and exited successfully, False otherwise.
    """
    print(title)
    print("-" * 40)
    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError as e:
        # The executable itself is missing (e.g. docling is not installed in
        # the active environment); report it instead of dumping a traceback.
        print(f"❌ Error: {e}")
        return False
    except subprocess.CalledProcessError as e:
        print(f"❌ Error: {e}")
        # The output was captured, so it must be replayed here; otherwise the
        # reason for the failure is lost.
        if e.stdout:
            print("\nOutput:")
            print(e.stdout)
        if e.stderr:
            print("Warnings/Info:")
            print(e.stderr)
        return False

    print(success_message)
    print("\nOutput:")
    print(result.stdout)
    if result.stderr:
        print("Warnings/Info:")
        print(result.stderr)
    return True


def run_docling_with_stats():
    """Demonstrate the --stats feature with example documents.

    Runs a single-document conversion and then a two-document batch
    conversion through the ``docling`` CLI with ``--stats``, printing the
    output of each. Stops after the first example that fails.
    """

    print("🚀 Docling CLI Performance Statistics Demo")
    print("=" * 50)
    print()

    # Example 1: Single document with stats
    single_ok = _run_example(
        "📄 Example 1: Single Document Performance Analysis",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_single",
        ],
        "✅ Conversion completed successfully!",
    )
    if not single_ok:
        return

    print("\n" + "=" * 50)
    print()

    # Example 2: Multiple documents with stats
    batch_ok = _run_example(
        "📄 Example 2: Batch Processing Performance Analysis",
        [
            "docling",
            "tests/data/pdf/2305.03393v1-pg9.pdf",
            "tests/data/pdf/code_and_formula.pdf",
            "--stats",
            "--output", "/tmp/stats_demo_batch",
        ],
        "✅ Batch conversion completed successfully!",
    )
    if not batch_ok:
        return

    print("\n🎉 Demo completed! The --stats feature provides valuable insights into:")
    print(" • Overall conversion performance (throughput, timing)")
    print(" • Detailed pipeline operation breakdowns")
    print(" • Processing bottlenecks identification")
    print(" • Batch processing analytics")


if __name__ == "__main__":
    # Check if we're in the right directory: the example inputs are addressed
    # relative to the repository root.
    if not Path("tests/data/pdf").exists():
        print("❌ Error: This script must be run from the Docling repository root directory")
        print(" Please run: cd /path/to/docling && python examples/stats_demo.py")
        sys.exit(1)

    run_docling_with_stats()
"Total Time" in output_text + assert "Throughput" in output_text + + # Check that pipeline timings are included + assert "⚙️ Pipeline Timings" in output_text + assert "Operation" in output_text + + # Verify the converted file exists + converted = output / f"{Path(source).stem}.md" + assert converted.exists()