diff --git a/README.md b/README.md index 55941c9..32ff751 100644 --- a/README.md +++ b/README.md @@ -63,8 +63,6 @@ for doc in layout.pipe(paths): print(doc._.layout) ``` -After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion. - spaCy also allows you to call the `nlp` object on an already created `Doc`, so you can easily apply a pipeline of components for [linguistic analysis](https://spacy.io/usage/linguistic-features) or [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities), use [rule-based matching](https://spacy.io/usage/rule-based-matching) or anything else you can do with spaCy. ```python @@ -110,6 +108,27 @@ def fix_text(text: str) -> str: layout = spaCyLayout(nlp, fix_text=fix_text) ``` +### Serialization + +After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion. + +```python +from spacy.tokens import DocBin + +docs = layout.pipe(["one.pdf", "two.pdf", "three.pdf"]) +doc_bin = DocBin(docs=docs, store_user_data=True) +doc_bin.to_disk("./file.spacy") +``` + +> ⚠️ **Note on deserializing with extension attributes:** The custom extension attributes like `Doc._.layout` are currently registered when `spaCyLayout` is initialized. So if you're loading back `Doc` objects with layout information from a binary file, you'll need to initialize it so the custom attributes can be repopulated. We're planning on making this more elegant in an upcoming version. 
+> +> ```diff +> + layout = spaCyLayout(nlp) +> doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy") +> docs = list(doc_bin.get_docs(nlp.vocab)) +> ``` + + ## 🎛️ API ### Data and extension attributes @@ -141,7 +160,7 @@ for span in doc.spans["layout"]: | Attribute | Type | Description | | --- | --- | --- | | `page_no` | `int` | The page number (1-indexed). | -| `width` | `float` | Page with in pixels. | +| `width` | `float` | Page width in pixels. | | `height` | `float` | Page height in pixels. | ### dataclass DocLayout @@ -193,12 +212,12 @@ doc = layout("./starcraft.pdf") | Argument | Type | Description | | --- | --- | --- | -| `source` | `str \| Path \| bytes` | Path of document to process or bytes. | +| `source` | `str \| Path \| bytes \| DoclingDocument` | Path of document to process, bytes or already created `DoclingDocument`. | | **RETURNS** | `Doc` | The processed spaCy `Doc` object. | #### method `spaCyLayout.pipe` -Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. +Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument works like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe). ```python layout = spaCyLayout(nlp) @@ -206,7 +225,72 @@ paths = ["one.pdf", "two.pdf", "three.pdf", ...] docs = layout.pipe(paths) ``` +```python +sources = [("one.pdf", {"id": 1}), ("two.pdf", {"id": 2})] +for doc, context in layout.pipe(sources, as_tuples=True): + ... +``` + | Argument | Type | Description | | --- | --- | --- | -| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. | -| **YIELDS** | `Doc` | The processed spaCy `Doc` object. 
| +| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. | +| `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. | + +## 💡 Examples and code snippets + +This section includes further examples of what you can do with `spacy-layout`. If you have an example that could be a good fit, feel free to submit a [pull request](https://github.com/explosion/spacy-layout/pulls)! + +### Visualize a page and bounding boxes with matplotlib + +```python +import pypdfium2 as pdfium +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +import spacy +from spacy_layout import spaCyLayout + +DOCUMENT_PATH = "./document.pdf" + +# Load and convert the PDF page to an image +pdf = pdfium.PdfDocument(DOCUMENT_PATH) +page_image = pdf[2].render(scale=1) # get page 3 (index 2) +numpy_array = page_image.to_numpy() +# Process document with spaCy +nlp = spacy.blank("en") +layout = spaCyLayout(nlp) +doc = layout(DOCUMENT_PATH) + +# Get page 3 layout and sections +page = doc._.pages[2] +page_layout = doc._.layout.pages[2] +# Create figure and axis with page dimensions +fig, ax = plt.subplots(figsize=(12, 16)) +# Display the PDF image +ax.imshow(numpy_array) +# Add rectangles for each section's bounding box +for section in page[1]: + # Create rectangle patch + rect = Rectangle( + (section._.layout.x, section._.layout.y), + section._.layout.width, + section._.layout.height, + fill=False, + color="blue", + linewidth=1, + alpha=0.5 + ) + ax.add_patch(rect) + # Add text label at top of box + ax.text( + section._.layout.x, + section._.layout.y, + section.label_, + fontsize=8, + 
color="red", + verticalalignment="bottom" + ) + +ax.axis("off") # hide axes +plt.show() +``` diff --git a/setup.cfg b/setup.cfg index 7701d8e..afb19e6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 0.0.9 +version = 0.0.12 description = Use spaCy with PDFs, Word docs and other documents url = https://github.com/explosion/spacy-layout author = Explosion diff --git a/spacy_layout/layout.py b/spacy_layout/layout.py index 4577180..787f31f 100644 --- a/spacy_layout/layout.py +++ b/spacy_layout/layout.py @@ -1,10 +1,20 @@ from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Callable, Iterable, Iterator +from typing import ( + TYPE_CHECKING, + Callable, + Iterable, + Iterator, + Literal, + TypeVar, + cast, + overload, +) import srsly from docling.datamodel.base_models import DocumentStream from docling.document_converter import DocumentConverter +from docling_core.types.doc.document import DoclingDocument from docling_core.types.doc.labels import DocItemLabel from spacy.tokens import Doc, Span, SpanGroup @@ -13,12 +23,15 @@ if TYPE_CHECKING: from docling.datamodel.base_models import InputFormat - from docling.document_converter import ConversionResult, FormatOption + from docling.document_converter import FormatOption from pandas import DataFrame from spacy.language import Language +# Type variable for contexts piped with documents +_AnyContext = TypeVar("_AnyContext") TABLE_PLACEHOLDER = "TABLE" +TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX] # Register msgpack encoders and decoders for custom types srsly.msgpack_encoders.register("spacy-layout.dataclass", func=encode_obj) @@ -68,37 +81,70 @@ def __init__( Span.set_extension(self.attrs.span_data, default=None, force=True) Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True) - def __call__(self, source: str | Path | bytes) -> Doc: + def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc: """Call 
parser on a path to create a spaCy Doc object.""" - result = self.converter.convert(self._get_source(source)) + if isinstance(source, DoclingDocument): + result = source + else: + result = self.converter.convert(self._get_source(source)).document return self._result_to_doc(result) - def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]: + @overload + def pipe( + self, + sources: Iterable[str | Path | bytes], + as_tuples: Literal[False] = ..., + ) -> Iterator[Doc]: ... + + @overload + def pipe( + self, + sources: Iterable[tuple[str | Path | bytes, _AnyContext]], + as_tuples: Literal[True] = ..., + ) -> Iterator[tuple[Doc, _AnyContext]]: ... + + def pipe( + self, + sources: ( + Iterable[str | Path | bytes] + | Iterable[tuple[str | Path | bytes, _AnyContext]] + ), + as_tuples: bool = False, + ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]: """Process multiple documents and create spaCy Doc objects.""" - data = (self._get_source(source) for source in sources) - results = self.converter.convert_all(data) - for result in results: - yield self._result_to_doc(result) + if as_tuples: + sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources) + data = (self._get_source(source) for source, _ in sources) + contexts = (context for _, context in sources) + results = self.converter.convert_all(data) + for result, context in zip(results, contexts): + yield (self._result_to_doc(result.document), context) + else: + sources = cast(Iterable[str | Path | bytes], sources) + data = (self._get_source(source) for source in sources) + results = self.converter.convert_all(data) + for result in results: + yield self._result_to_doc(result.document) def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream: if isinstance(source, (str, Path)): return source return DocumentStream(name="source", stream=BytesIO(source)) - def _result_to_doc(self, result: "ConversionResult") -> Doc: + def _result_to_doc(self, document: DoclingDocument) 
-> Doc: inputs = [] pages = { - (page.page_no + 1): PageLayout( - page_no=page.page_no + 1, + (page.page_no): PageLayout( + page_no=page.page_no, width=page.size.width if page.size else 0, height=page.size.height if page.size else 0, ) - for page in result.pages + for _, page in document.pages.items() } - text_items = {item.self_ref: item for item in result.document.texts} - table_items = {item.self_ref: item for item in result.document.tables} + text_items = {item.self_ref: item for item in document.texts} + table_items = {item.self_ref: item for item in document.tables} # We want to iterate over the tree to get different elements in order - for node, _ in result.document.iterate_items(): + for node, _ in document.iterate_items(): if node.self_ref in text_items: item = text_items[node.self_ref] text = item.text @@ -116,7 +162,7 @@ def _result_to_doc(self, result: "ConversionResult") -> Doc: inputs.append((table_text, item)) doc = self._texts_to_doc(inputs, pages) doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()])) - doc._.set(self.attrs.doc_markdown, result.document.export_to_markdown()) + doc._.set(self.attrs.doc_markdown, document.export_to_markdown()) return doc def _texts_to_doc( @@ -147,7 +193,7 @@ def _texts_to_doc( span = Span(doc, start=start, end=end, label=item.label, span_id=i) layout = self._get_span_layout(item, pages) span._.set(self.attrs.span_layout, layout) - if item.label == DocItemLabel.TABLE: + if item.label in TABLE_ITEM_LABELS: span._.set(self.attrs.span_data, item.export_to_dataframe()) spans.append(span) doc.spans[self.attrs.span_group] = SpanGroup( @@ -191,5 +237,5 @@ def get_tables(self, doc: Doc) -> list[Span]: return [ span for span in doc.spans[self.attrs.span_group] - if span.label_ == DocItemLabel.TABLE + if span.label_ in TABLE_ITEM_LABELS ] diff --git a/tests/data/table_document_index.pdf b/tests/data/table_document_index.pdf new file mode 100644 index 0000000..cdfa135 Binary files /dev/null and 
b/tests/data/table_document_index.pdf differ diff --git a/tests/test_general.py b/tests/test_general.py index 9813c8b..36d6c6b 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -8,6 +8,7 @@ from pandas import DataFrame from pandas.testing import assert_frame_equal from spacy.tokens import DocBin +import pandas as pd from spacy_layout import spaCyLayout from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box @@ -18,6 +19,7 @@ DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx" PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read() PDF_TABLE = Path(__file__).parent / "data" / "table.pdf" +PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf" @pytest.fixture @@ -41,6 +43,21 @@ def test_general(path, nlp, span_labels): assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout) +@pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)]) +def test_pages(path, pg_no, nlp): + layout = spaCyLayout(nlp) + doc = layout(path) + # This should not raise a KeyError when accessing `pages` dict + # Key Error would mean a mismatched pagination on document layout and span layout + result = layout.get_pages(doc) + assert len(result) == pg_no + assert result[0][0].page_no == 1 + if pg_no == 6: # there should be 16 or 18 spans on the pg_no 1 + assert len(result[0][1]) in (16, 18) + elif pg_no == 1: # there should be 4 spans on pg_no 1 + assert len(result[0][1]) == 4 + + @pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE]) @pytest.mark.parametrize("separator", ["\n\n", ""]) def test_simple(path, separator, nlp): @@ -66,6 +83,15 @@ def fix_text(text): assert doc.text.startswith("LOREM ipsum dolor sit amet") +def test_simple_pipe_as_tuples(nlp): + layout = spaCyLayout(nlp) + data = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")] + result = list(layout.pipe(data, as_tuples=True)) + for doc, _ in result: + assert len(doc.spans[layout.attrs.span_group]) == 4 + assert [context for _, context in result] == ["pdf", 
"docx"] + + def test_table(nlp): layout = spaCyLayout(nlp) doc = layout(PDF_TABLE) @@ -95,6 +121,23 @@ def test_table(nlp): assert markdown in doc._.get(layout.attrs.doc_markdown) +def test_table_index(nlp): + layout = spaCyLayout(nlp) + doc = layout(PDF_INDEX) + assert len(doc._.get(layout.attrs.doc_tables)) == 3 + table = doc._.get(layout.attrs.doc_tables)[0] + assert table.text == TABLE_PLACEHOLDER + assert table.label_ == DocItemLabel.DOCUMENT_INDEX.value + + # Check that each document_index table has a dataframe + document_index_tables = [span for span in doc._.get( + layout.attrs.doc_tables) if span.label_ == DocItemLabel.DOCUMENT_INDEX.value] + for table in document_index_tables: + assert table._.data is not None, "Table data not available" + assert isinstance( + table._.data, pd.DataFrame), "Table data is not a DataFrame" + + def test_table_placeholder(nlp): def display_table(df): return f"Table with columns: {', '.join(df.columns.tolist())}"