Changes from all commits (24 commits):
- `2042aee` Allow DoclingDocument as direct input (ines, Dec 13, 2024)
- `b9ca1e0` Merge pull request #20 from explosion/feature/allow-doclingdocument (ines, Dec 13, 2024)
- `b90cbb1` Increment version (ines, Dec 13, 2024)
- `8562ccf` start paginating from 1 (magdaaniol, Dec 23, 2024)
- `42fa2f2` add test case (magdaaniol, Dec 23, 2024)
- `4168b64` Merge pull request #22 from magdaaniol/fix/pagination (ines, Dec 24, 2024)
- `267bff7` Increment version (ines, Dec 24, 2024)
- `ab4a9e7` Update README.md (wjbmattingly, Jan 13, 2025)
- `de9546d` fix typo (svlandeg, Feb 25, 2025)
- `d9298a5` Add support for document index tables (mepuka, Feb 27, 2025)
- `4fbcfc2` Merge pull request #33 from svlandeg/fix/typo [ci skip] (ines, Feb 27, 2025)
- `c81bd10` Add as_tuples argument to spaCyLayout.pipe (ines, Mar 7, 2025)
- `1d33864` Increment version [ci skip] (ines, Mar 7, 2025)
- `f197bd4` Add section on serialization [ci skip] (ines, Mar 7, 2025)
- `61feff9` Update test to match current predictions (ines, Mar 7, 2025)
- `dfb8a87` Merge branch 'main' into feature/pipe-as-tuples (ines, Mar 7, 2025)
- `6733e75` Merge branch 'main' into pr/34 (ines, Mar 7, 2025)
- `7564226` Adjust example and move to own section (ines, Mar 7, 2025)
- `2483bb4` Merge pull request #26 from wjbmattingly/patch-2 [ci skip] (ines, Mar 7, 2025)
- `5facc6b` Merge pull request #34 from mkessy/fix/doc._.tables (ines, Mar 7, 2025)
- `72c995d` Merge branch 'main' into feature/pipe-as-tuples (ines, Mar 7, 2025)
- `63835b0` Merge pull request #36 from explosion/feature/pipe-as-tuples (ines, Mar 8, 2025)
- `1e6a51a` Add as_tuples example [ci skip] (ines, Mar 8, 2025)
- `ff67193` Merge branch 'main' into feature/fix-text (tomato-7, Jun 16, 2025)
98 changes: 91 additions & 7 deletions README.md
@@ -63,8 +63,6 @@ for doc in layout.pipe(paths):
    print(doc._.layout)
```

After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion.

spaCy also allows you to call the `nlp` object on an already created `Doc`, so you can easily apply a pipeline of components for [linguistic analysis](https://spacy.io/usage/linguistic-features) or [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities), use [rule-based matching](https://spacy.io/usage/rule-based-matching) or anything else you can do with spaCy.

```python
@@ -110,6 +108,27 @@ def fix_text(text: str) -> str:
layout = spaCyLayout(nlp, fix_text=fix_text)
```
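
For illustration, here's a minimal sketch of such a `fix_text` callback (the cleanup rules are just an example, not part of the library); it assumes the `nlp` object and imports from the snippets above:

```python
import unicodedata

def fix_text(text: str) -> str:
    # Example cleanup only: normalize unicode and collapse runs of whitespace
    text = unicodedata.normalize("NFC", text)
    return " ".join(text.split())

layout = spaCyLayout(nlp, fix_text=fix_text)
```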

### Serialization

After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion.

```python
from spacy.tokens import DocBin

docs = layout.pipe(["one.pdf", "two.pdf", "three.pdf"])
doc_bin = DocBin(docs=docs, store_user_data=True)
doc_bin.to_disk("./file.spacy")
```

> ⚠️ **Note on deserializing with extension attributes:** The custom extension attributes like `Doc._.layout` are currently registered when `spaCyLayout` is initialized. So if you're loading back `Doc` objects with layout information from a binary file, you'll need to initialize `spaCyLayout` first so the custom attributes can be repopulated. We're planning on making this more elegant in an upcoming version.
>
> ```diff
> + layout = spaCyLayout(nlp)
> doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy")
> docs = list(doc_bin.get_docs(nlp.vocab))
> ```
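
Putting both snippets together, a minimal round-trip sketch (file names are placeholders) could look like this:

```python
import spacy
from spacy.tokens import DocBin
from spacy_layout import spaCyLayout

nlp = spacy.blank("en")
layout = spaCyLayout(nlp)  # registers the custom extension attributes

# Convert once and serialize to disk
doc_bin = DocBin(docs=layout.pipe(["one.pdf", "two.pdf"]), store_user_data=True)
doc_bin.to_disk("./file.spacy")

# Later: load the docs back without re-running the conversion
doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
print(docs[0]._.layout)
```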


## 🎛️ API

### Data and extension attributes
@@ -141,7 +160,7 @@ for span in doc.spans["layout"]:
| Attribute | Type | Description |
| --- | --- | --- |
| `page_no` | `int` | The page number (1-indexed). |
| `width` | `float` | Page with in pixels. |
| `width` | `float` | Page width in pixels. |
| `height` | `float` | Page height in pixels. |

### <kbd>dataclass</kbd> DocLayout
@@ -193,20 +212,85 @@ doc = layout("./starcraft.pdf")

| Argument | Type | Description |
| --- | --- | --- |
| `source` | `str \| Path \| bytes` | Path of document to process or bytes. |
| `source` | `str \| Path \| bytes \| DoclingDocument` | Path of the document to process, bytes, or an already created `DoclingDocument` (see the example below). |
| **RETURNS** | `Doc` | The processed spaCy `Doc` object. |
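
For example, if you've already run the conversion with Docling yourself, you can pass the resulting `DoclingDocument` straight to the layout object. A minimal sketch (the converter setup follows Docling's standard `DocumentConverter` usage and the file path is a placeholder):

```python
import spacy
from docling.document_converter import DocumentConverter
from spacy_layout import spaCyLayout

nlp = spacy.blank("en")
layout = spaCyLayout(nlp)

# Convert with Docling first, then hand the DoclingDocument to spaCyLayout
docling_doc = DocumentConverter().convert("./document.pdf").document
doc = layout(docling_doc)
print(doc._.layout)
```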

#### <kbd>method</kbd> `spaCyLayout.pipe`

Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale.
Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument works just like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).

```python
layout = spaCyLayout(nlp)
paths = ["one.pdf", "two.pdf", "three.pdf", ...]
docs = layout.pipe(paths)
```

```python
sources = [("one.pdf", {"id": 1}), ("two.pdf", {"id": 2})]
for doc, context in layout.pipe(sources, as_tuples=True):
    ...
```

| Argument | Type | Description |
| --- | --- | --- |
| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. |
| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. |
| `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |
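
As a usage sketch (the directory path is hypothetical), `as_tuples` makes it easy to keep track of which file each `Doc` came from when processing a whole folder:

```python
from pathlib import Path

sources = [(path, {"source": path.name}) for path in Path("./pdfs").glob("*.pdf")]
for doc, context in layout.pipe(sources, as_tuples=True):
    print(context["source"], len(doc.spans["layout"]))
```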

## 💡 Examples and code snippets

This section includes further examples of what you can do with `spacy-layout`. If you have an example that could be a good fit, feel free to submit a [pull request](https://github.com/explosion/spacy-layout/pulls)!

### Visualize a page and bounding boxes with matplotlib

```python
import pypdfium2 as pdfium
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import spacy
from spacy_layout import spaCyLayout

DOCUMENT_PATH = "./document.pdf"

# Load and convert the PDF page to an image
pdf = pdfium.PdfDocument(DOCUMENT_PATH)
page_image = pdf[2].render(scale=1) # get page 3 (index 2)
numpy_array = page_image.to_numpy()
# Process document with spaCy
nlp = spacy.blank("en")
layout = spaCyLayout(nlp)
doc = layout(DOCUMENT_PATH)

# Get page 3 layout and sections
page = doc._.pages[2]
page_layout = doc._.layout.pages[2]
# Create figure and axis with page dimensions
fig, ax = plt.subplots(figsize=(12, 16))
# Display the PDF image
ax.imshow(numpy_array)
# Add rectangles for each section's bounding box
for section in page[1]:
    # Create rectangle patch
    rect = Rectangle(
        (section._.layout.x, section._.layout.y),
        section._.layout.width,
        section._.layout.height,
        fill=False,
        color="blue",
        linewidth=1,
        alpha=0.5
    )
    ax.add_patch(rect)
    # Add text label at top of box
    ax.text(
        section._.layout.x,
        section._.layout.y,
        section.label_,
        fontsize=8,
        color="red",
        verticalalignment="bottom"
    )

ax.axis("off") # hide axes
plt.show()
```
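
### Export table data with pandas

A short sketch (file names are placeholders) that writes the `DataFrame` attached to each table span via `span._.data` out to CSV files:

```python
import spacy
from spacy_layout import spaCyLayout

nlp = spacy.blank("en")
layout = spaCyLayout(nlp)
doc = layout("./document.pdf")

# Each table span exposes its contents as a pandas DataFrame via span._.data
for i, table in enumerate(doc._.tables):
    table._.data.to_csv(f"table_{i}.csv", index=False)
```
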
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[metadata]
version = 0.0.9
version = 0.0.12
description = Use spaCy with PDFs, Word docs and other documents
url = https://github.com/explosion/spacy-layout
author = Explosion
84 changes: 65 additions & 19 deletions spacy_layout/layout.py
@@ -1,10 +1,20 @@
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Iterable, Iterator
from typing import (
    TYPE_CHECKING,
    Callable,
    Iterable,
    Iterator,
    Literal,
    TypeVar,
    cast,
    overload,
)

import srsly
from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
from spacy.tokens import Doc, Span, SpanGroup

@@ -13,12 +23,15 @@

if TYPE_CHECKING:
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import ConversionResult, FormatOption
    from docling.document_converter import FormatOption
    from pandas import DataFrame
    from spacy.language import Language

# Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext")

TABLE_PLACEHOLDER = "TABLE"
TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]

# Register msgpack encoders and decoders for custom types
srsly.msgpack_encoders.register("spacy-layout.dataclass", func=encode_obj)
@@ -68,37 +81,70 @@ def __init__(
        Span.set_extension(self.attrs.span_data, default=None, force=True)
        Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)

    def __call__(self, source: str | Path | bytes) -> Doc:
    def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
        """Call parser on a path to create a spaCy Doc object."""
        result = self.converter.convert(self._get_source(source))
        if isinstance(source, DoclingDocument):
            result = source
        else:
            result = self.converter.convert(self._get_source(source)).document
        return self._result_to_doc(result)

    def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
    @overload
    def pipe(
        self,
        sources: Iterable[str | Path | bytes],
        as_tuples: Literal[False] = ...,
    ) -> Iterator[Doc]: ...

    @overload
    def pipe(
        self,
        sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
        as_tuples: Literal[True] = ...,
    ) -> Iterator[tuple[Doc, _AnyContext]]: ...

    def pipe(
        self,
        sources: (
            Iterable[str | Path | bytes]
            | Iterable[tuple[str | Path | bytes, _AnyContext]]
        ),
        as_tuples: bool = False,
    ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
        """Process multiple documents and create spaCy Doc objects."""
        data = (self._get_source(source) for source in sources)
        results = self.converter.convert_all(data)
        for result in results:
            yield self._result_to_doc(result)
        if as_tuples:
            sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources)
            data = (self._get_source(source) for source, _ in sources)
            contexts = (context for _, context in sources)
            results = self.converter.convert_all(data)
            for result, context in zip(results, contexts):
                yield (self._result_to_doc(result.document), context)
        else:
            sources = cast(Iterable[str | Path | bytes], sources)
            data = (self._get_source(source) for source in sources)
            results = self.converter.convert_all(data)
            for result in results:
                yield self._result_to_doc(result.document)

    def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
        if isinstance(source, (str, Path)):
            return source
        return DocumentStream(name="source", stream=BytesIO(source))

    def _result_to_doc(self, result: "ConversionResult") -> Doc:
    def _result_to_doc(self, document: DoclingDocument) -> Doc:
        inputs = []
        pages = {
            (page.page_no + 1): PageLayout(
                page_no=page.page_no + 1,
            (page.page_no): PageLayout(
                page_no=page.page_no,
                width=page.size.width if page.size else 0,
                height=page.size.height if page.size else 0,
            )
            for page in result.pages
            for _, page in document.pages.items()
        }
        text_items = {item.self_ref: item for item in result.document.texts}
        table_items = {item.self_ref: item for item in result.document.tables}
        text_items = {item.self_ref: item for item in document.texts}
        table_items = {item.self_ref: item for item in document.tables}
        # We want to iterate over the tree to get different elements in order
        for node, _ in result.document.iterate_items():
        for node, _ in document.iterate_items():
            if node.self_ref in text_items:
                item = text_items[node.self_ref]
                text = item.text
@@ -116,7 +162,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc:
                inputs.append((table_text, item))
        doc = self._texts_to_doc(inputs, pages)
        doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
        doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown())
        doc._.set(self.attrs.doc_markdown, document.export_to_markdown())
        return doc

    def _texts_to_doc(
@@ -147,7 +193,7 @@ def _texts_to_doc(
            span = Span(doc, start=start, end=end, label=item.label, span_id=i)
            layout = self._get_span_layout(item, pages)
            span._.set(self.attrs.span_layout, layout)
            if item.label == DocItemLabel.TABLE:
            if item.label in TABLE_ITEM_LABELS:
                span._.set(self.attrs.span_data, item.export_to_dataframe())
            spans.append(span)
        doc.spans[self.attrs.span_group] = SpanGroup(
@@ -191,5 +237,5 @@ def get_tables(self, doc: Doc) -> list[Span]:
        return [
            span
            for span in doc.spans[self.attrs.span_group]
            if span.label_ == DocItemLabel.TABLE
            if span.label_ in TABLE_ITEM_LABELS
        ]
Binary file added tests/data/table_document_index.pdf
43 changes: 43 additions & 0 deletions tests/test_general.py
@@ -8,6 +8,7 @@
from pandas import DataFrame
from pandas.testing import assert_frame_equal
from spacy.tokens import DocBin
import pandas as pd

from spacy_layout import spaCyLayout
from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box
@@ -18,6 +19,7 @@
DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx"
PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read()
PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"


@pytest.fixture
@@ -41,6 +43,21 @@ def test_general(path, nlp, span_labels):
        assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)


@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
def test_pages(path, pg_no, nlp):
layout = spaCyLayout(nlp)
doc = layout(path)
# This should not raise a KeyError when accessing `pages` dict
# Key Error would mean a mismatched pagination on document layout and span layout
result = layout.get_pages(doc)
assert len(result) == pg_no
assert result[0][0].page_no == 1
if pg_no == 6: # there should be 16 or 18 spans on the pg_no 1
assert len(result[0][1]) in (16, 18)
elif pg_no == 1: # there should be 4 spans on pg_no 1
assert len(result[0][1]) == 4


@pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
@pytest.mark.parametrize("separator", ["\n\n", ""])
def test_simple(path, separator, nlp):
@@ -66,6 +83,15 @@ def fix_text(text):
    assert doc.text.startswith("LOREM ipsum dolor sit amet")


def test_simple_pipe_as_tuples(nlp):
    layout = spaCyLayout(nlp)
    data = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
    result = list(layout.pipe(data, as_tuples=True))
    for doc, _ in result:
        assert len(doc.spans[layout.attrs.span_group]) == 4
    assert [context for _, context in result] == ["pdf", "docx"]


def test_table(nlp):
    layout = spaCyLayout(nlp)
    doc = layout(PDF_TABLE)
@@ -95,6 +121,23 @@ def test_table(nlp):
    assert markdown in doc._.get(layout.attrs.doc_markdown)


def test_table_index(nlp):
    layout = spaCyLayout(nlp)
    doc = layout(PDF_INDEX)
    assert len(doc._.get(layout.attrs.doc_tables)) == 3
    table = doc._.get(layout.attrs.doc_tables)[0]
    assert table.text == TABLE_PLACEHOLDER
    assert table.label_ == DocItemLabel.DOCUMENT_INDEX.value

    # Check that each document_index table has a dataframe
    document_index_tables = [
        span
        for span in doc._.get(layout.attrs.doc_tables)
        if span.label_ == DocItemLabel.DOCUMENT_INDEX.value
    ]
    for table in document_index_tables:
        assert table._.data is not None, "Table data not available"
        assert isinstance(table._.data, pd.DataFrame), "Table data is not a DataFrame"


def test_table_placeholder(nlp):
    def display_table(df):
        return f"Table with columns: {', '.join(df.columns.tolist())}"