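Fix pages getting shuffled by visualize()

- EntityList.visualize(): deduplicate with dict.fromkeys() instead of set() so entities (and therefore pages) keep their original order.
- TextLinearizationConfig: rename duplicate_text_in_merged_cells to table_duplicate_text_in_merged_cells and move it, together with table_flatten_headers, next to the other table_* options; table.py is updated to use the new name.
- EntityList.get_text_and_words(): join the per-entity texts with config.layout_element_separator instead of concatenating them directly.
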
Belval · Belval · commit 17e4f57bfd22 · 2024-02-23T23:44:29.000Z
diff --git a/textractor/data/text_linearization_config.py b/textractor/data/text_linearization_config.py
@@ -68,6 +68,10 @@ class TextLinearizationConfig:
 
     table_tabulate_remove_extra_hyphens: bool = False  #: By default markdown tables will have N hyphens to preserve alignement, this reduces the number of hyphens to 1, which is the minimum number allowed by the GitHub Markdown spec
 
+    table_duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
+
+    table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally
+
     table_min_table_words: int = 0  #: Threshold below which tables will be rendered as words instead of using table layout
 
     table_column_separator: str = "\t"  #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature
@@ -147,7 +151,3 @@ class TextLinearizationConfig:
     add_prefixes_and_suffixes_as_words: bool = False #: Controls if the prefixes/suffixes will be inserted in the words returned by `get_text_and_words`
 
     add_prefixes_and_suffixes_in_text: bool = True #: Controls if the prefixes/suffixes will be added to the linearized text
-
-    duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
-
-    table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally
diff --git a/textractor/entities/table.py b/textractor/entities/table.py
@@ -527,7 +527,7 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
                 for i, cell in enumerate(row):
                     if (
                         cell not in processed_cells or
-                        config.duplicate_text_in_merged_cells or
+                        config.table_duplicate_text_in_merged_cells or
                         config.table_flatten_headers
                     ):
                         if cell.siblings:
@@ -581,7 +581,7 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
                 if cell.siblings:
                     children = []
                     first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
-                    if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
+                    if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
                         for sib in cell.siblings:
                             children.extend(sib.children)
                             processed_cells.add(sib)
@@ -698,7 +698,7 @@ def get_text_and_words(
                 for i, cell in enumerate(row):
                     if (
                         cell not in processed_cells or
-                        config.duplicate_text_in_merged_cells or
+                        config.table_duplicate_text_in_merged_cells or
                         config.table_flatten_headers
                     ):
                         if cell.siblings:
@@ -763,7 +763,7 @@ def get_text_and_words(
                     row_index = first_row
                     row_span = last_row - first_row + 1
                     children = []
-                    if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
+                    if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
                         for sib in cell.siblings:
                             children.extend(sib.children)
                             processed_cells.add(sib)
@@ -887,7 +887,7 @@ def get_text_and_words(
                     for i, cell in enumerate(row):
                         if (
                             cell not in processed_cells or
-                            config.duplicate_text_in_merged_cells or
+                            config.table_duplicate_text_in_merged_cells or
                             config.table_flatten_headers
                         ):
                             if cell.siblings:
@@ -932,7 +932,7 @@ def get_text_and_words(
                     # Siblings includes the current cell
                     if cell.siblings:
                         first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
-                        if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
+                        if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
                             for sib in cell.siblings:
                                 children.extend(sib.children)
                                 processed_cells.add(sib)
diff --git a/textractor/visualizers/entitylist.py b/textractor/visualizers/entitylist.py
@@ -105,7 +105,7 @@ def visualize(
                     self._add_expense_document_to_list(new_entity_list, entity)
                 else:
                     new_entity_list.append(entity)
-            return EntityList(list(set(new_entity_list))).visualize(
+            return EntityList(list(dict.fromkeys(new_entity_list).keys())).visualize(
                 with_text=with_text,
                 with_words=with_words,
                 with_confidence=with_confidence,
@@ -131,7 +131,7 @@ def visualize(
 
         for page in list(entities_pagewise.keys()):
             # Deduplication
-            entities_pagewise[page] = list(set(entities_pagewise[page]))
+            entities_pagewise[page] = list(dict.fromkeys(entities_pagewise[page]).keys())
 
         for page in entities_pagewise.keys():
             visualized_images[page] = _draw_bbox(
@@ -488,12 +488,12 @@ def __add__(self, list2):
         return EntityList([*self, *list2])
 
     def get_text_and_words(self, config: TextLinearizationConfig = TextLinearizationConfig()):
-        text, words = "", []
+        texts, words = [], []  # one text per entity, joined once at return
         for entity in self:
             entity_text, entity_words = entity.get_text_and_words(config)
-            text += entity_text
+            texts.append(entity_text)
             words.extend(entity_words)
-        return text, words
+        return config.layout_element_separator.join(texts), words  # separator is placed between entity texts only
 
 def _convert_form_to_list(
     form_objects,