aws-samples
diff --git a/‎docs/source/examples.rst
Lines changed: 2 additions & 0 deletions b/‎docs/source/examples.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/source/notebooks/document_linearization_to_markdown_or_html.ipynb
Lines changed: 641 additions & 0 deletions b/‎docs/source/notebooks/document_linearization_to_markdown_or_html.ipynb
Lines changed: 641 additions & 0 deletions
diff --git a/‎docs/source/notebooks/tabular_data_linearization_continued.ipynb
Lines changed: 465 additions & 0 deletions b/‎docs/source/notebooks/tabular_data_linearization_continued.ipynb
Lines changed: 465 additions & 0 deletions
diff --git a/‎tests/fixtures/vbat.png
43.6 KB b/‎tests/fixtures/vbat.png
43.6 KB
diff --git a/‎tests/fixtures/vbat2.png
55.1 KB b/‎tests/fixtures/vbat2.png
55.1 KB
diff --git a/‎textractor/data/html_linearization_config.py
Lines changed: 41 additions & 0 deletions b/‎textractor/data/html_linearization_config.py
Lines changed: 41 additions & 0 deletions
diff --git a/‎textractor/data/markdown_linearization_config.py
Lines changed: 19 additions & 0 deletions b/‎textractor/data/markdown_linearization_config.py
Lines changed: 19 additions & 0 deletions
diff --git a/‎textractor/data/text_linearization_config.py
Lines changed: 18 additions & 2 deletions b/‎textractor/data/text_linearization_config.py
Lines changed: 18 additions & 2 deletions
diff --git a/‎textractor/entities/document.py
Lines changed: 3 additions & 9 deletions b/‎textractor/entities/document.py
Lines changed: 3 additions & 9 deletions
diff --git a/‎textractor/entities/document_entity.py
Lines changed: 4 additions & 13 deletions b/‎textractor/entities/document_entity.py
Lines changed: 4 additions & 13 deletions
@@ -16,7 +16,9 @@ Examples
    notebooks/using_queries
    notebooks/layout_analysis
    notebooks/tabular_data_linearization
+   notebooks/tabular_data_linearization_continued
    notebooks/layout_analysis_for_text_linearization
+   notebooks/document_linearization_to_markdown_or_html
    notebooks/textractor_for_large_language_models
    notebooks/interfacing_with_trp2
    notebooks/signature_detection
 
@@ -0,0 +1,41 @@
+import os
+from dataclasses import dataclass
+
+from textractor.data.text_linearization_config import TextLinearizationConfig
+
+@dataclass
+class HTMLLinearizationConfig(TextLinearizationConfig):
+    """
+    This :class:`HTMLLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to HTML.
+    For a description of the parameters see :class:`TextLinearizationConfig`.
+    """
+
+    title_prefix: str = "<h1>"
+
+    title_suffix: str = "</h1>"
+
+    section_header_prefix: str = "<h2>"
+
+    section_header_suffix: str = "</h2>"
+
+    text_prefix: str = "<p>"
+
+    text_suffix: str = "</p>"
+
+    table_prefix: str = "<table>"
+
+    table_suffix: str = "</table>"
+
+    table_row_prefix: str = "<tr>"
+
+    table_row_suffix: str = "</tr>"
+
+    table_cell_header_prefix: str = "<th>"
+
+    table_cell_header_suffix: str = "</th>"
+
+    table_cell_prefix: str = "<td>"
+
+    table_cell_suffix: str = "</td>"
+
+    table_column_separator: str = ""
@@ -0,0 +1,19 @@
+import os
+from dataclasses import dataclass
+
+from textractor.data.text_linearization_config import TextLinearizationConfig
+
+@dataclass
+class MarkdownLinearizationConfig(TextLinearizationConfig):
+    """
+    This :class:`MarkdownLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to Markdown.
+    For a description of the parameters see :class:`TextLinearizationConfig`.
+    """
+
+    title_prefix: str = "# "
+
+    table_linearization_format: str = "markdown"
+
+    section_header_prefix: str = "## "
+
+    table_remove_column_headers: bool = True
@@ -12,6 +12,8 @@ class TextLinearizationConfig:
 
     max_number_of_consecutive_new_lines: int = 2  #: Removes extra whitespace
 
+    max_number_of_consecutive_spaces: int = None  #: Removes extra whitespace (None skips whitespace removal)
+
     hide_header_layout: bool = False  #: Hide headers in the linearized output
 
     hide_footer_layout: bool = False  #: Hide footers in the linearized output
@@ -56,14 +58,20 @@ class TextLinearizationConfig:
 
     table_layout_suffix: str = os.linesep  #: Suffix for table elements
 
-    table_remove_column_headers: bool = False  #: Remove column headers from tables
+    table_remove_column_headers: bool = False  #: Remove pandas index column headers from tables
 
     table_column_header_threshold: float = 0.9 #: Threshold for a row to be selected as header when rendering as markdown. 0.9 means that 90% of the cells must have the is_header_cell flag. 
 
-    table_linearization_format: str = "plaintext"  #: How to represent tables in the linearized output. Choices are plaintext or markdown.
+    table_linearization_format: str = "plaintext"  #: How to represent tables in the linearized output. Choices are plaintext, markdown or HTML.
 
     table_tabulate_format: str = "github"  #: Markdown tabulate format to use when table are linearized as markdown
 
+    table_tabulate_remove_extra_hyphens: bool = False  #: By default markdown tables will have N hyphens to preserve alignment; this reduces the number of hyphens to 1, which is the minimum number allowed by the GitHub Markdown spec
+
+    table_duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
+
+    table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally
+
     table_min_table_words: int = 0  #: Threshold below which tables will be rendered as words instead of using table layout
 
     table_column_separator: str = "\t"  #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature
@@ -88,6 +96,14 @@ class TextLinearizationConfig:
 
     table_cell_empty_cell_placeholder: str = "" #: Placeholder for empty cells
 
+    table_cell_merge_cell_placeholder: str = "" #: Placeholder for merged cell
+
+    table_cell_left_merge_cell_placeholder: str = "" #: Placeholder for left merge cell (L)
+
+    table_cell_top_merge_cell_placeholder: str = "" #: Placeholder for top merge cell (T)
+
+    table_cell_cross_merge_cell_placeholder: str = "" #: Placeholder for cross merge cell (X)
+
     header_prefix: str = ""  #: Prefix for header layout elements
 
     header_suffix: str = ""  #: Suffix for header layout elements
 
@@ -24,7 +24,6 @@
 from textractor.exceptions import InputError
 from textractor.entities.key_value import KeyValue
 from textractor.entities.bbox import SpatialObject
-from textractor.data.constants import SelectionStatus
 from textractor.utils.s3_utils import download_from_s3
 from textractor.visualizers.entitylist import EntityList
 from textractor.data.constants import (
@@ -33,12 +32,12 @@
     Direction,
     DirectionalFinderType,
 )
-from textractor.entities.selection_element import SelectionElement
-from textractor.utils.search_utils import SearchUtils, jaccard_similarity
+from textractor.utils.search_utils import SearchUtils
 from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.entities.linearizable import Linearizable
 
 
-class Document(SpatialObject):
+class Document(SpatialObject, Linearizable):
     """
     Represents the description of a single document, as it would appear in the input to the Textract API.
     Document serves as the root node of the object model hierarchy,
@@ -244,11 +243,6 @@ def pages(self, pages: List[Page]):
         """
         self._pages = sorted(pages, key=lambda x: x.page_num)
 
-    def get_text(
-        self, config: TextLinearizationConfig = TextLinearizationConfig()
-    ) -> str:
-        return self.get_text_and_words(config)[0]
-
     def get_text_and_words(
         self, config: TextLinearizationConfig = TextLinearizationConfig()
     ) -> Tuple[str, List]:
 
@@ -6,9 +6,11 @@
 from textractor.entities.bbox import BoundingBox
 from textractor.visualizers.entitylist import EntityList
 from textractor.data.text_linearization_config import TextLinearizationConfig
+from textractor.data.html_linearization_config import HTMLLinearizationConfig
+from textractor.data.markdown_linearization_config import MarkdownLinearizationConfig
+from textractor.entities.linearizable import Linearizable
 
-
-class DocumentEntity(ABC):
+class DocumentEntity(Linearizable, ABC):
     """
     An interface for all document entities within the document body, composing the
     hierarchy of the document object model.
@@ -193,14 +195,3 @@ def visualize(self, *args, **kwargs) -> EntityList:
         """
         return EntityList(self).visualize(*args, **kwargs)
 
-    @abstractmethod
-    def get_text_and_words(
-        self, config: TextLinearizationConfig = TextLinearizationConfig()
-    ) -> Tuple[str, List]:
-        """
-        Used for linearization, returns the linearized text of the entity and the matching words
-
-        :return: Tuple of text and word list
-        :rtype: Tuple[str, List[Word]]
-        """
-        pass