Skip to content

Commit 2b3b289

Browse files
authored
Table linearization improvements
2 parents 5ea39f8 + 17e4f57 commit 2b3b289

17 files changed

+1526
-124
lines changed

docs/source/examples.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ Examples
1616
notebooks/using_queries
1717
notebooks/layout_analysis
1818
notebooks/tabular_data_linearization
19+
notebooks/tabular_data_linearization_continued
1920
notebooks/layout_analysis_for_text_linearization
21+
notebooks/document_linearization_to_markdown_or_html
2022
notebooks/textractor_for_large_language_models
2123
notebooks/interfacing_with_trp2
2224
notebooks/signature_detection

docs/source/notebooks/document_linearization_to_markdown_or_html.ipynb

Lines changed: 641 additions & 0 deletions
Large diffs are not rendered by default.

docs/source/notebooks/tabular_data_linearization_continued.ipynb

Lines changed: 465 additions & 0 deletions
Large diffs are not rendered by default.

tests/fixtures/vbat.png

43.6 KB
Loading

tests/fixtures/vbat2.png

55.1 KB
Loading
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import os
2+
from dataclasses import dataclass
3+
4+
from textractor.data.text_linearization_config import TextLinearizationConfig
5+
6+
@dataclass
7+
class HTMLLinearizationConfig(TextLinearizationConfig):
8+
"""
9+
This :class:`HTMLLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to HTML.
10+
For a description of the parameters see :class:`TextLinearizationConfig`.
11+
"""
12+
13+
title_prefix: str = "<h1>"
14+
15+
title_suffix: str = "</h1>"
16+
17+
section_header_prefix: str = "<h2>"
18+
19+
section_header_suffix: str = "</h2>"
20+
21+
text_prefix: str = "<p>"
22+
23+
text_suffix: str = "</p>"
24+
25+
table_prefix: str = "<table>"
26+
27+
table_suffix: str = "</table>"
28+
29+
table_row_prefix: str = "<tr>"
30+
31+
table_row_suffix: str = "</tr>"
32+
33+
table_cell_header_prefix: str = "<th>"
34+
35+
table_cell_header_suffix: str = "</th>"
36+
37+
table_cell_prefix: str = "<td>"
38+
39+
table_cell_suffix: str = "</td>"
40+
41+
table_column_separator: str = ""
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import os
2+
from dataclasses import dataclass
3+
4+
from textractor.data.text_linearization_config import TextLinearizationConfig
5+
6+
@dataclass
7+
class MarkdownLinearizationConfig(TextLinearizationConfig):
8+
"""
9+
This :class:`MarkdownLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to Markdown.
10+
For a description of the parameters see :class:`TextLinearizationConfig`.
11+
"""
12+
13+
title_prefix: str = "# "
14+
15+
table_linearization_format: str = "markdown"
16+
17+
section_header_prefix: str = "## "
18+
19+
table_remove_column_headers: bool = True

textractor/data/text_linearization_config.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ class TextLinearizationConfig:
1212

1313
max_number_of_consecutive_new_lines: int = 2 #: Removes extra whitespace
1414

15+
max_number_of_consecutive_spaces: int = None #: Removes extra whitespace (None skips whitespace removal)
16+
1517
hide_header_layout: bool = False #: Hide headers in the linearized output
1618

1719
hide_footer_layout: bool = False #: Hide footers in the linearized output
@@ -56,14 +58,20 @@ class TextLinearizationConfig:
5658

5759
table_layout_suffix: str = os.linesep #: Suffix for table elements
5860

59-
table_remove_column_headers: bool = False #: Remove column headers from tables
61+
table_remove_column_headers: bool = False #: Remove pandas index column headers from tables
6062

6163
table_column_header_threshold: float = 0.9 #: Threshold for a row to be selected as header when rendering as markdown. 0.9 means that 90% of the cells must have the is_header_cell flag.
6264

63-
table_linearization_format: str = "plaintext" #: How to represent tables in the linearized output. Choices are plaintext or markdown.
65+
table_linearization_format: str = "plaintext" #: How to represent tables in the linearized output. Choices are plaintext, markdown or HTML.
6466

6567
table_tabulate_format: str = "github" #: Markdown tabulate format to use when tables are linearized as markdown
6668

69+
table_tabulate_remove_extra_hyphens: bool = False #: By default markdown tables will have N hyphens to preserve alignment; enabling this reduces the number of hyphens to 1, which is the minimum number allowed by the GitHub Markdown spec
70+
71+
table_duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
72+
73+
table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally
74+
6775
table_min_table_words: int = 0 #: Threshold below which tables will be rendered as words instead of using table layout
6876

6977
table_column_separator: str = "\t" #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature
@@ -88,6 +96,14 @@ class TextLinearizationConfig:
8896

8997
table_cell_empty_cell_placeholder: str = "" #: Placeholder for empty cells
9098

99+
table_cell_merge_cell_placeholder: str = "" #: Placeholder for merged cell
100+
101+
table_cell_left_merge_cell_placeholder: str = "" #: Placeholder for left merge cell (L)
102+
103+
table_cell_top_merge_cell_placeholder: str = "" #: Placeholder for top merge cell (T)
104+
105+
table_cell_cross_merge_cell_placeholder: str = "" #: Placeholder for cross merge cell (X)
106+
91107
header_prefix: str = "" #: Prefix for header layout elements
92108

93109
header_suffix: str = "" #: Suffix for header layout elements

textractor/entities/document.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from textractor.exceptions import InputError
2525
from textractor.entities.key_value import KeyValue
2626
from textractor.entities.bbox import SpatialObject
27-
from textractor.data.constants import SelectionStatus
2827
from textractor.utils.s3_utils import download_from_s3
2928
from textractor.visualizers.entitylist import EntityList
3029
from textractor.data.constants import (
@@ -33,12 +32,12 @@
3332
Direction,
3433
DirectionalFinderType,
3534
)
36-
from textractor.entities.selection_element import SelectionElement
37-
from textractor.utils.search_utils import SearchUtils, jaccard_similarity
35+
from textractor.utils.search_utils import SearchUtils
3836
from textractor.data.text_linearization_config import TextLinearizationConfig
37+
from textractor.entities.linearizable import Linearizable
3938

4039

41-
class Document(SpatialObject):
40+
class Document(SpatialObject, Linearizable):
4241
"""
4342
Represents the description of a single document, as it would appear in the input to the Textract API.
4443
Document serves as the root node of the object model hierarchy,
@@ -244,11 +243,6 @@ def pages(self, pages: List[Page]):
244243
"""
245244
self._pages = sorted(pages, key=lambda x: x.page_num)
246245

247-
def get_text(
248-
self, config: TextLinearizationConfig = TextLinearizationConfig()
249-
) -> str:
250-
return self.get_text_and_words(config)[0]
251-
252246
def get_text_and_words(
253247
self, config: TextLinearizationConfig = TextLinearizationConfig()
254248
) -> Tuple[str, List]:

textractor/entities/document_entity.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from textractor.entities.bbox import BoundingBox
77
from textractor.visualizers.entitylist import EntityList
88
from textractor.data.text_linearization_config import TextLinearizationConfig
9+
from textractor.data.html_linearization_config import HTMLLinearizationConfig
10+
from textractor.data.markdown_linearization_config import MarkdownLinearizationConfig
11+
from textractor.entities.linearizable import Linearizable
912

10-
11-
class DocumentEntity(ABC):
13+
class DocumentEntity(Linearizable, ABC):
1214
"""
1315
An interface for all document entities within the document body, composing the
1416
hierarchy of the document object model.
@@ -193,14 +195,3 @@ def visualize(self, *args, **kwargs) -> EntityList:
193195
"""
194196
return EntityList(self).visualize(*args, **kwargs)
195197

196-
@abstractmethod
197-
def get_text_and_words(
198-
self, config: TextLinearizationConfig = TextLinearizationConfig()
199-
) -> Tuple[str, List]:
200-
"""
201-
Used for linearization, returns the linearized text of the entity and the matching words
202-
203-
:return: Tuple of text and word list
204-
:rtype: Tuple[str, List[Word]]
205-
"""
206-
pass

0 commit comments

Comments
 (0)