Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,14 @@ jobs:
~/.EasyOCR/
key: models-cache

- name: Pre-download Models
run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
- name: Free up disk space
run: |
df -h
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo apt-get clean
df -h
- name: Run examples
run: |
Expand Down
103 changes: 74 additions & 29 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,23 @@
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
WHISPER_BASE_MLX,
WHISPER_BASE_NATIVE,
WHISPER_LARGE,
WHISPER_LARGE_MLX,
WHISPER_LARGE_NATIVE,
WHISPER_MEDIUM,
WHISPER_MEDIUM_MLX,
WHISPER_MEDIUM_NATIVE,
WHISPER_SMALL,
WHISPER_SMALL_MLX,
WHISPER_SMALL_NATIVE,
WHISPER_TINY,
WHISPER_TINY_MLX,
WHISPER_TINY_NATIVE,
WHISPER_TURBO,
WHISPER_TURBO_MLX,
WHISPER_TURBO_NATIVE,
AsrModelType,
)
from docling.datamodel.base_models import (
Expand Down Expand Up @@ -611,6 +623,7 @@ def convert( # noqa: C901
ocr_options.psm = psm

accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)

# pipeline_options: PaginatedPipelineOptions
pipeline_options: PipelineOptions

Expand Down Expand Up @@ -747,42 +760,74 @@ def convert( # noqa: C901
InputFormat.IMAGE: pdf_format_option,
}

elif pipeline == ProcessingPipeline.ASR:
pipeline_options = AsrPipelineOptions(
# enable_remote_services=enable_remote_services,
# artifacts_path = artifacts_path
)
# Set ASR options
asr_pipeline_options = AsrPipelineOptions(
accelerator_options=AcceleratorOptions(
device=device,
num_threads=num_threads,
),
# enable_remote_services=enable_remote_services,
# artifacts_path = artifacts_path
)

if asr_model == AsrModelType.WHISPER_TINY:
pipeline_options.asr_options = WHISPER_TINY
elif asr_model == AsrModelType.WHISPER_SMALL:
pipeline_options.asr_options = WHISPER_SMALL
elif asr_model == AsrModelType.WHISPER_MEDIUM:
pipeline_options.asr_options = WHISPER_MEDIUM
elif asr_model == AsrModelType.WHISPER_BASE:
pipeline_options.asr_options = WHISPER_BASE
elif asr_model == AsrModelType.WHISPER_LARGE:
pipeline_options.asr_options = WHISPER_LARGE
elif asr_model == AsrModelType.WHISPER_TURBO:
pipeline_options.asr_options = WHISPER_TURBO
else:
_log.error(f"{asr_model} is not known")
raise ValueError(f"{asr_model} is not known")
# Auto-selecting models (choose best implementation for hardware)
if asr_model == AsrModelType.WHISPER_TINY:
asr_pipeline_options.asr_options = WHISPER_TINY
elif asr_model == AsrModelType.WHISPER_SMALL:
asr_pipeline_options.asr_options = WHISPER_SMALL
elif asr_model == AsrModelType.WHISPER_MEDIUM:
asr_pipeline_options.asr_options = WHISPER_MEDIUM
elif asr_model == AsrModelType.WHISPER_BASE:
asr_pipeline_options.asr_options = WHISPER_BASE
elif asr_model == AsrModelType.WHISPER_LARGE:
asr_pipeline_options.asr_options = WHISPER_LARGE
elif asr_model == AsrModelType.WHISPER_TURBO:
asr_pipeline_options.asr_options = WHISPER_TURBO

# Explicit MLX models (force MLX implementation)
elif asr_model == AsrModelType.WHISPER_TINY_MLX:
asr_pipeline_options.asr_options = WHISPER_TINY_MLX
elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
elif asr_model == AsrModelType.WHISPER_BASE_MLX:
asr_pipeline_options.asr_options = WHISPER_BASE_MLX
elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
asr_pipeline_options.asr_options = WHISPER_TURBO_MLX

# Explicit Native models (force native implementation)
elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE

_log.info(f"pipeline_options: {pipeline_options}")
else:
_log.error(f"{asr_model} is not known")
raise ValueError(f"{asr_model} is not known")

audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline,
pipeline_options=pipeline_options,
)
_log.info(f"ASR pipeline_options: {asr_pipeline_options}")

format_options = {
InputFormat.AUDIO: audio_format_option,
}
audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline,
pipeline_options=asr_pipeline_options,
)
format_options[InputFormat.AUDIO] = audio_format_option

# Common options for all pipelines
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
# audio_pipeline_options.artifacts_path = artifacts_path
asr_pipeline_options.artifacts_path = artifacts_path

doc_converter = DocumentConverter(
allowed_formats=from_formats,
Expand Down
Loading