diff --git a/README.md b/README.md
index 55941c9..32ff751 100644
--- a/README.md
+++ b/README.md
@@ -63,8 +63,6 @@ for doc in layout.pipe(paths):
print(doc._.layout)
```
-After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion.
-
spaCy also allows you to call the `nlp` object on an already created `Doc`, so you can easily apply a pipeline of components for [linguistic analysis](https://spacy.io/usage/linguistic-features) or [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities), use [rule-based matching](https://spacy.io/usage/rule-based-matching) or anything else you can do with spaCy.
```python
@@ -110,6 +108,27 @@ def fix_text(text: str) -> str:
layout = spaCyLayout(nlp, fix_text=fix_text)
```
+### Serialization
+
+After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion.
+
+```python
+from spacy.tokens import DocBin
+
+docs = layout.pipe(["one.pdf", "two.pdf", "three.pdf"])
+doc_bin = DocBin(docs=docs, store_user_data=True)
+doc_bin.to_disk("./file.spacy")
+```
+
+> ⚠️ **Note on deserializing with extension attributes:** The custom extension attributes like `Doc._.layout` are currently registered when `spaCyLayout` is initialized. So if you're loading back `Doc` objects with layout information from a binary file, you'll need to initialize `spaCyLayout` first so the custom attributes can be repopulated. We're planning on making this more elegant in an upcoming version.
+>
+> ```diff
+> + layout = spaCyLayout(nlp)
+> doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy")
+> docs = list(doc_bin.get_docs(nlp.vocab))
+> ```
+
+
## 🎛️ API
### Data and extension attributes
@@ -141,7 +160,7 @@ for span in doc.spans["layout"]:
| Attribute | Type | Description |
| --- | --- | --- |
| `page_no` | `int` | The page number (1-indexed). |
-| `width` | `float` | Page with in pixels. |
+| `width` | `float` | Page width in pixels. |
| `height` | `float` | Page height in pixels. |
### dataclass DocLayout
@@ -193,12 +212,12 @@ doc = layout("./starcraft.pdf")
| Argument | Type | Description |
| --- | --- | --- |
-| `source` | `str \| Path \| bytes` | Path of document to process or bytes. |
+| `source` | `str \| Path \| bytes \| DoclingDocument` | Path of document to process, bytes, or an already created `DoclingDocument`. |
| **RETURNS** | `Doc` | The processed spaCy `Doc` object. |
#### method `spaCyLayout.pipe`
-Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale.
+Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument works like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).
```python
layout = spaCyLayout(nlp)
@@ -206,7 +225,72 @@ paths = ["one.pdf", "two.pdf", "three.pdf", ...]
docs = layout.pipe(paths)
```
+```python
+sources = [("one.pdf", {"id": 1}), ("two.pdf", {"id": 2})]
+for doc, context in layout.pipe(sources, as_tuples=True):
+ ...
+```
+
| Argument | Type | Description |
| --- | --- | --- |
-| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. |
-| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
+| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. |
+| `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
+| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |
+
+## 💡 Examples and code snippets
+
+This section includes further examples of what you can do with `spacy-layout`. If you have an example that could be a good fit, feel free to submit a [pull request](https://github.com/explosion/spacy-layout/pulls)!
+
+### Visualize a page and bounding boxes with matplotlib
+
+```python
+import pypdfium2 as pdfium
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+import spacy
+from spacy_layout import spaCyLayout
+
+DOCUMENT_PATH = "./document.pdf"
+
+# Load and convert the PDF page to an image
+pdf = pdfium.PdfDocument(DOCUMENT_PATH)
+page_image = pdf[2].render(scale=1) # get page 3 (index 2)
+numpy_array = page_image.to_numpy()
+# Process document with spaCy
+nlp = spacy.blank("en")
+layout = spaCyLayout(nlp)
+doc = layout(DOCUMENT_PATH)
+
+# Get page 3 layout and sections
+page = doc._.pages[2]
+page_layout = doc._.layout.pages[2]
+# Create figure and axis with page dimensions
+fig, ax = plt.subplots(figsize=(12, 16))
+# Display the PDF image
+ax.imshow(numpy_array)
+# Add rectangles for each section's bounding box
+for section in page[1]:
+ # Create rectangle patch
+ rect = Rectangle(
+ (section._.layout.x, section._.layout.y),
+ section._.layout.width,
+ section._.layout.height,
+ fill=False,
+ color="blue",
+ linewidth=1,
+ alpha=0.5
+ )
+ ax.add_patch(rect)
+ # Add text label at top of box
+ ax.text(
+ section._.layout.x,
+ section._.layout.y,
+ section.label_,
+ fontsize=8,
+ color="red",
+ verticalalignment="bottom"
+ )
+
+ax.axis("off") # hide axes
+plt.show()
+```
diff --git a/setup.cfg b/setup.cfg
index 7701d8e..afb19e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[metadata]
-version = 0.0.9
+version = 0.0.12
description = Use spaCy with PDFs, Word docs and other documents
url = https://github.com/explosion/spacy-layout
author = Explosion
diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py
index 4577180..787f31f 100644
--- a/spacy_layout/layout.py
+++ b/spacy_layout/layout.py
@@ -1,10 +1,20 @@
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Iterable, Iterator
+from typing import (
+ TYPE_CHECKING,
+ Callable,
+ Iterable,
+ Iterator,
+ Literal,
+ TypeVar,
+ cast,
+ overload,
+)
import srsly
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from spacy.tokens import Doc, Span, SpanGroup
@@ -13,12 +23,15 @@
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
- from docling.document_converter import ConversionResult, FormatOption
+ from docling.document_converter import FormatOption
from pandas import DataFrame
from spacy.language import Language
+# Type variable for contexts piped with documents
+_AnyContext = TypeVar("_AnyContext")
TABLE_PLACEHOLDER = "TABLE"
+TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
# Register msgpack encoders and decoders for custom types
srsly.msgpack_encoders.register("spacy-layout.dataclass", func=encode_obj)
@@ -68,37 +81,70 @@ def __init__(
Span.set_extension(self.attrs.span_data, default=None, force=True)
Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)
- def __call__(self, source: str | Path | bytes) -> Doc:
+ def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
"""Call parser on a path to create a spaCy Doc object."""
- result = self.converter.convert(self._get_source(source))
+ # An already created DoclingDocument is used as-is, without running the converter
+ if isinstance(source, DoclingDocument):
+ result = source
+ else:
+ # Paths and bytes go through the converter; only the converted document is passed on
+ result = self.converter.convert(self._get_source(source)).document
return self._result_to_doc(result)
- def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
+ @overload
+ def pipe(
+ self,
+ sources: Iterable[str | Path | bytes],
+ as_tuples: Literal[False] = ...,
+ ) -> Iterator[Doc]: ...
+
+ @overload
+ def pipe(
+ self,
+ sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
+ as_tuples: Literal[True] = ...,
+ ) -> Iterator[tuple[Doc, _AnyContext]]: ...
+
+    def pipe(
+        self,
+        sources: (
+            Iterable[str | Path | bytes]
+            | Iterable[tuple[str | Path | bytes, _AnyContext]]
+        ),
+        as_tuples: bool = False,
+    ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
"""Process multiple documents and create spaCy Doc objects."""
- data = (self._get_source(source) for source in sources)
- results = self.converter.convert_all(data)
- for result in results:
- yield self._result_to_doc(result)
+        # sources: paths or bytes, or (source, context) tuples if as_tuples is set.
+        # as_tuples: if True, yield (doc, context) tuples instead of plain docs.
+        if as_tuples:
+            # Imported locally so the module-level imports stay untouched.
+            from itertools import tee
+
+            sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources)
+            # `sources` may be a one-shot iterator (e.g. a generator). Reading it
+            # from two independent generator expressions would interleave the
+            # reads, dropping documents and pairing results with the wrong
+            # context, so split it into two synchronized views first.
+            source_view, context_view = tee(sources)
+            data = (self._get_source(source) for source, _ in source_view)
+            contexts = (context for _, context in context_view)
+            results = self.converter.convert_all(data)
+            # `zip` pairs every result with its context in input order.
+            for result, context in zip(results, contexts):
+                yield (self._result_to_doc(result.document), context)
+        else:
+            sources = cast(Iterable[str | Path | bytes], sources)
+            data = (self._get_source(source) for source in sources)
+            results = self.converter.convert_all(data)
+            # Each conversion result carries the DoclingDocument in `.document`.
+            for result in results:
+                yield self._result_to_doc(result.document)
def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
if isinstance(source, (str, Path)):
return source
return DocumentStream(name="source", stream=BytesIO(source))
- def _result_to_doc(self, result: "ConversionResult") -> Doc:
+ def _result_to_doc(self, document: DoclingDocument) -> Doc:
inputs = []
pages = {
- (page.page_no + 1): PageLayout(
- page_no=page.page_no + 1,
+ (page.page_no): PageLayout(
+ page_no=page.page_no,
width=page.size.width if page.size else 0,
height=page.size.height if page.size else 0,
)
- for page in result.pages
+ for _, page in document.pages.items()
}
- text_items = {item.self_ref: item for item in result.document.texts}
- table_items = {item.self_ref: item for item in result.document.tables}
+ text_items = {item.self_ref: item for item in document.texts}
+ table_items = {item.self_ref: item for item in document.tables}
# We want to iterate over the tree to get different elements in order
- for node, _ in result.document.iterate_items():
+ for node, _ in document.iterate_items():
if node.self_ref in text_items:
item = text_items[node.self_ref]
text = item.text
@@ -116,7 +162,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
inputs.append((table_text, item))
doc = self._texts_to_doc(inputs, pages)
doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
- doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
+ doc._.set(self.attrs.doc_markdown, document.export_to_markdown())
return doc
def _texts_to_doc(
@@ -147,7 +193,7 @@ def _texts_to_doc(
span = Span(doc, start=start, end=end, label=item.label, span_id=i)
layout = self._get_span_layout(item, pages)
span._.set(self.attrs.span_layout, layout)
- if item.label == DocItemLabel.TABLE:
+ if item.label in TABLE_ITEM_LABELS:
span._.set(self.attrs.span_data, item.export_to_dataframe())
spans.append(span)
doc.spans[self.attrs.span_group] = SpanGroup(
@@ -191,5 +237,5 @@ def get_tables(self, doc: Doc) -> list[Span]:
return [
span
for span in doc.spans[self.attrs.span_group]
- if span.label_ == DocItemLabel.TABLE
+ if span.label_ in TABLE_ITEM_LABELS
]
diff --git a/tests/data/table_document_index.pdf b/tests/data/table_document_index.pdf
new file mode 100644
index 0000000..cdfa135
Binary files /dev/null and b/tests/data/table_document_index.pdf differ
diff --git a/tests/test_general.py b/tests/test_general.py
index 9813c8b..36d6c6b 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -8,6 +8,7 @@
from pandas import DataFrame
from pandas.testing import assert_frame_equal
from spacy.tokens import DocBin
+import pandas as pd
from spacy_layout import spaCyLayout
from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box
@@ -18,6 +19,7 @@
DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx"
PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read()
PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
+PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"
@pytest.fixture
@@ -41,6 +43,21 @@ def test_general(path, nlp, span_labels):
assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)
+@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
+def test_pages(path, pg_no, nlp):
+    """Check the page count, 1-based page numbers and first-page span counts."""
+    layout = spaCyLayout(nlp)
+    doc = layout(path)
+    # `get_pages` must not raise a KeyError when it looks pages up: that would
+    # mean the document layout and the span layouts disagree on page numbers.
+    pages = layout.get_pages(doc)
+    assert len(pages) == pg_no
+    first_page = pages[0]
+    assert first_page[0].page_no == 1
+    if pg_no == 6:  # the first page is expected to hold 16 or 18 spans
+        assert len(first_page[1]) in (16, 18)
+    elif pg_no == 1:  # the first page is expected to hold 4 spans
+        assert len(first_page[1]) == 4
+
+
@pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
@pytest.mark.parametrize("separator", ["\n\n", ""])
def test_simple(path, separator, nlp):
@@ -66,6 +83,15 @@ def fix_text(text):
assert doc.text.startswith("LOREM ipsum dolor sit amet")
+def test_simple_pipe_as_tuples(nlp):
+    """`pipe` with `as_tuples=True` yields `(doc, context)` pairs in input order."""
+    layout = spaCyLayout(nlp)
+    inputs = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
+    pairs = list(layout.pipe(inputs, as_tuples=True))
+    docs = [doc for doc, _ in pairs]
+    contexts = [context for _, context in pairs]
+    # Both sample documents are expected to produce four layout spans
+    for doc in docs:
+        assert len(doc.spans[layout.attrs.span_group]) == 4
+    assert contexts == ["pdf", "docx"]
+
+
def test_table(nlp):
layout = spaCyLayout(nlp)
doc = layout(PDF_TABLE)
@@ -95,6 +121,23 @@ def test_table(nlp):
assert markdown in doc._.get(layout.attrs.doc_markdown)
+def test_table_index(nlp):
+    """Document-index items are returned as tables that carry DataFrame data."""
+    layout = spaCyLayout(nlp)
+    doc = layout(PDF_INDEX)
+    tables = doc._.get(layout.attrs.doc_tables)
+    assert len(tables) == 3
+    first_table = tables[0]
+    assert first_table.text == TABLE_PLACEHOLDER
+    assert first_table.label_ == DocItemLabel.DOCUMENT_INDEX.value
+
+    # Every table labelled as a document index must expose its data as a DataFrame
+    index_label = DocItemLabel.DOCUMENT_INDEX.value
+    for table in tables:
+        if table.label_ != index_label:
+            continue
+        assert table._.data is not None, "Table data not available"
+        assert isinstance(table._.data, pd.DataFrame), "Table data is not a DataFrame"
+
+
def test_table_placeholder(nlp):
def display_table(df):
return f"Table with columns: {', '.join(df.columns.tolist())}"