Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 6 additions & 12 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,8 @@ jobs:
- name: Install System Dependencies
run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update
fi
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
sudo apt-get -qq update
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
- name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
Expand Down Expand Up @@ -149,10 +147,8 @@ jobs:
- name: Install System Dependencies
run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update
fi
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
sudo apt-get -qq update
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
- name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
Expand Down Expand Up @@ -223,10 +219,8 @@ jobs:
- name: Install System Dependencies
run: |
if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then
sudo apt-get -qq update
fi
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
sudo apt-get -qq update
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
- name: Set TESSDATA_PREFIX
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
Expand Down
131 changes: 131 additions & 0 deletions docling/backend/docx/drawingml/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import shutil
import subprocess
from pathlib import Path
from tempfile import mkdtemp
from typing import Callable, Optional

import pypdfium2
from docx.document import Document
from PIL import Image, ImageChops


def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Return the LibreOffice command and optionally test it.

    Candidates are tried in this order and the first hit wins:

    1. the ``DOCLING_LIBREOFFICE_CMD`` environment variable, when it names an
       executable (an absolute path, or a command resolvable on ``PATH``);
    2. ``libreoffice`` on ``PATH``;
    3. ``soffice`` on ``PATH``;
    4. the default macOS application bundle location, if that file exists.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            if no command was found, and run ``<cmd> -h`` to check that the
            command can actually be executed.

    Returns:
        The command to invoke, or ``None`` when nothing was found and
        ``raise_if_unavailable`` is false.

    Raises:
        RuntimeError: No command was found and ``raise_if_unavailable`` is true.
        subprocess.CalledProcessError: The ``-h`` probe exited with a non-zero
            status (only when ``raise_if_unavailable`` is true).
        OSError: The ``-h`` probe could not be started (only when
            ``raise_if_unavailable`` is true).
    """
    macos_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # An explicit user override takes precedence over auto-detection. An
    # override that does not resolve to an executable is ignored so that the
    # regular lookup still gets a chance.
    configured_cmd = os.environ.get("DOCLING_LIBREOFFICE_CMD")

    libreoffice_cmd = (
        (shutil.which(configured_cmd) if configured_cmd else None)
        or shutil.which("libreoffice")
        or shutil.which("soffice")
        or (macos_bundle_cmd if os.path.isfile(macos_bundle_cmd) else None)
    )

    if raise_if_unavailable:
        if libreoffice_cmd is None:
            raise RuntimeError("Libreoffice not found")

        # The following test will raise if the libreoffice_cmd cannot be used
        subprocess.run(
            [
                libreoffice_cmd,
                "-h",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )

    return libreoffice_cmd


def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detect the best available DOCX to PDF tool and return a conversion function.

    The returned function accepts ``(input_path, output_path)``; each may be a
    ``str`` or a path-like object. It runs the tool synchronously and raises
    ``subprocess.CalledProcessError`` if the tool exits with a non-zero status.

    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            # Normalise to Path so that str and Path arguments compare equal
            # below, and so that a bare output filename yields "." as the
            # output directory instead of an empty string.
            source = Path(input_path)
            target = Path(output_path)
            out_dir = target.parent

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    str(out_dir),
                    str(source),
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # Only the output directory is passed to the tool; the file is
            # expected at "<input name without extension>.pdf" inside it, so
            # move it when the caller asked for a different name.
            produced = out_dir / (os.path.splitext(source.name)[0] + ".pdf")
            if produced != target:
                os.rename(produced, target)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None


def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background border surrounding the image content.

    Args:
        image: The image to crop.
        bg_color: Colour treated as background. When ``None``, the colour of
            the top-left pixel is used.
        padding: Number of pixels to keep around the detected content; the
            padded box is clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no bounding box of
        differing pixels is found.
    """
    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    # Compare against a solid canvas of the background colour; the bounding
    # box of the difference encloses everything that is not background.
    backdrop = Image.new(image.mode, image.size, background_color)
    content_box = ImageChops.difference(image, backdrop).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(padded_box)


def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render a DOCX document to a cropped PIL image via an intermediate PDF.

    The document is saved into a fresh temporary directory, converted to PDF
    with ``converter``, and the first PDF page is rendered and passed through
    ``crop_whitespace``. The temporary directory and the PDF handles are
    released even when one of these steps raises.

    Args:
        docx: The document to render.
        converter: Callable taking ``(input_path, output_path)`` that writes a
            PDF of the input document to ``output_path``, or ``None``.

    Returns:
        The cropped rendering of the first page, or ``None`` when ``converter``
        is ``None``.

    Raises:
        Whatever ``docx.save``, ``converter`` or the PDF loading/rendering
        raise; the exception propagates after cleanup.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            # Close the document before the directory is removed so the PDF
            # file is no longer held open at deletion time.
            pdf.close()
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
99 changes: 87 additions & 12 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
import re
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import Any, List, Optional, Union
from typing import Any, Callable, List, Optional, Union

from docling_core.types.doc import (
DocItemLabel,
Expand Down Expand Up @@ -33,6 +34,11 @@
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.docx.drawingml.utils import (
get_docx_to_pdf_converter,
get_libreoffice_cmd,
get_pil_from_dml_docx,
)
from docling.backend.docx.latex.omml import oMath2Latex
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
Expand Down Expand Up @@ -64,6 +70,9 @@ def __init__(
self.equation_bookends: str = "<eq>{EQ}</eq>"
# Track processed textbox elements to avoid duplication
self.processed_textbox_elements: List[int] = []
self.docx_to_pdf_converter: Optional[Callable] = None
self.docx_to_pdf_converter_init = False
self.display_drawingml_warning = True

for i in range(-1, self.max_levels):
self.parents[i] = None
Expand All @@ -80,18 +89,11 @@ def __init__(
"indents": [None],
}

self.docx_obj = None
try:
if isinstance(self.path_or_stream, BytesIO):
self.docx_obj = Document(self.path_or_stream)
elif isinstance(self.path_or_stream, Path):
self.docx_obj = Document(str(self.path_or_stream))

self.docx_obj = self.load_msword_file(
path_or_stream=self.path_or_stream, document_hash=self.document_hash
)
if self.docx_obj:
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e

@override
def is_valid(self) -> bool:
Expand Down Expand Up @@ -139,6 +141,22 @@ def convert(self) -> DoclingDocument:
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)

@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a Word document from a file path or an in-memory stream.

    Args:
        path_or_stream: A ``BytesIO`` holding the document bytes, or a ``Path``
            to the document on disk.
        document_hash: Identifier of the document, used only in the error
            message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither a
        ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: Loading the document failed; the original exception is
            chained as the cause.
    """
    if isinstance(path_or_stream, BytesIO):
        source: Any = path_or_stream
    elif isinstance(path_or_stream, Path):
        source = str(path_or_stream)
    else:
        # Unsupported input type: report "nothing loaded" rather than raising.
        return None

    try:
        return Document(source)
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e

def _update_history(
self,
name: str,
Expand Down Expand Up @@ -195,6 +213,7 @@ def _walk_linear(
}
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)

# Check for textbox content - check multiple textbox formats
# Only process if the element hasn't been processed before
Expand Down Expand Up @@ -274,6 +293,26 @@ def _walk_linear(
):
te1 = self._handle_text_elements(element, docx_obj, doc)
added_elements.extend(te1)
# Check for DrawingML elements
elif drawingml_els:
if (
self.docx_to_pdf_converter is None
and self.docx_to_pdf_converter_init is False
):
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
self.docx_to_pdf_converter_init = True

if self.docx_to_pdf_converter is None:
if self.display_drawingml_warning:
if self.docx_to_pdf_converter is None:
_log.warning(
"Found DrawingML elements in document, but no DOCX to PDF converters. "
"If you want these exported, make sure you have "
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
)
self.display_drawingml_warning = False
else:
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
Expand Down Expand Up @@ -1381,3 +1420,39 @@ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
)
elem_ref.append(p3.get_ref())
return elem_ref

def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any) -> None:
    """Add the given DrawingML elements to ``doc`` as one picture item.

    A copy of the source document is emptied, the DrawingML elements are
    placed into a single new run, and that document is rendered to an image
    with ``self.docx_to_pdf_converter``. If rendering fails, a picture item
    without image data is added instead so the rest of the conversion can
    continue.

    Args:
        doc: The document that receives the picture item.
        drawingml_els: Iterable of DrawingML XML elements to render together.
    """
    # Function-scope import: used only to recognise converter failures below.
    import subprocess

    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for child in list(body):
        body.remove(child)

    # 2) Add DrawingML to empty document
    new_para = dml_doc.add_paragraph()
    new_r = new_para.add_run()
    for dml in drawingml_els:
        # Deep-copy so the elements stay attached to the original document.
        new_r._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            raise UnidentifiedImageError

        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
        )
    except (UnidentifiedImageError, OSError, subprocess.SubprocessError):
        # SubprocessError covers a converter process that exits with an error,
        # so a failed export degrades to a placeholder like the other
        # failure modes instead of aborting the whole conversion.
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
        )
Binary file added tests/data/docx/drawingml.docx
Binary file not shown.
13 changes: 13 additions & 0 deletions tests/data/groundtruth/docling_v2/drawingml.docx.itxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group textbox
item-2 at level 2: text: Text 2
item-3 at level 2: text: Text 1
item-4 at level 1: picture
item-5 at level 1: text:
item-6 at level 1: text:
item-7 at level 1: text:
item-8 at level 1: text:
item-9 at level 1: text:
item-10 at level 1: text:
item-11 at level 1: text:
item-12 at level 1: picture
Loading