Skip to content

Commit 17e4f57

Browse files
committed
Fix pages getting shuffled by visualize()
1 parent e9c13d5 commit 17e4f57

File tree

3 files changed

+15
-15
lines changed

3 files changed

+15
-15
lines changed

textractor/data/text_linearization_config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ class TextLinearizationConfig:
6868

6969
table_tabulate_remove_extra_hyphens: bool = False #: By default markdown tables will have N hyphens to preserve alignment; this reduces the number of hyphens to 1, which is the minimum number allowed by the GitHub Markdown spec
7070

71+
table_duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
72+
73+
table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally
74+
7175
table_min_table_words: int = 0 #: Threshold below which tables will be rendered as words instead of using table layout
7276

7377
table_column_separator: str = "\t" #: Table column separator, used when linearizing layout tables, not used if AnalyzeDocument was called with the TABLES feature
@@ -147,7 +151,3 @@ class TextLinearizationConfig:
147151
add_prefixes_and_suffixes_as_words: bool = False #: Controls if the prefixes/suffixes will be inserted in the words returned by `get_text_and_words`
148152

149153
add_prefixes_and_suffixes_in_text: bool = True #: Controls if the prefixes/suffixes will be added to the linearized text
150-
151-
duplicate_text_in_merged_cells: bool = False #: Duplicate text in merged cells to preserve line alignment
152-
153-
table_flatten_headers: bool = False #: Flatten table headers into a single row, unmerging the cells horizontally

textractor/entities/table.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
527527
for i, cell in enumerate(row):
528528
if (
529529
cell not in processed_cells or
530-
config.duplicate_text_in_merged_cells or
530+
config.table_duplicate_text_in_merged_cells or
531531
config.table_flatten_headers
532532
):
533533
if cell.siblings:
@@ -581,7 +581,7 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
581581
if cell.siblings:
582582
children = []
583583
first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
584-
if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
584+
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
585585
for sib in cell.siblings:
586586
children.extend(sib.children)
587587
processed_cells.add(sib)
@@ -698,7 +698,7 @@ def get_text_and_words(
698698
for i, cell in enumerate(row):
699699
if (
700700
cell not in processed_cells or
701-
config.duplicate_text_in_merged_cells or
701+
config.table_duplicate_text_in_merged_cells or
702702
config.table_flatten_headers
703703
):
704704
if cell.siblings:
@@ -763,7 +763,7 @@ def get_text_and_words(
763763
row_index = first_row
764764
row_span = last_row - first_row + 1
765765
children = []
766-
if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
766+
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
767767
for sib in cell.siblings:
768768
children.extend(sib.children)
769769
processed_cells.add(sib)
@@ -887,7 +887,7 @@ def get_text_and_words(
887887
for i, cell in enumerate(row):
888888
if (
889889
cell not in processed_cells or
890-
config.duplicate_text_in_merged_cells or
890+
config.table_duplicate_text_in_merged_cells or
891891
config.table_flatten_headers
892892
):
893893
if cell.siblings:
@@ -932,7 +932,7 @@ def get_text_and_words(
932932
# Siblings includes the current cell
933933
if cell.siblings:
934934
first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
935-
if (cell.col_index == first_col and cell.row_index == first_row) or config.duplicate_text_in_merged_cells:
935+
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
936936
for sib in cell.siblings:
937937
children.extend(sib.children)
938938
processed_cells.add(sib)

textractor/visualizers/entitylist.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def visualize(
105105
self._add_expense_document_to_list(new_entity_list, entity)
106106
else:
107107
new_entity_list.append(entity)
108-
return EntityList(list(set(new_entity_list))).visualize(
108+
return EntityList(list(dict.fromkeys(new_entity_list).keys())).visualize(
109109
with_text=with_text,
110110
with_words=with_words,
111111
with_confidence=with_confidence,
@@ -131,7 +131,7 @@ def visualize(
131131

132132
for page in list(entities_pagewise.keys()):
133133
# Deduplication
134-
entities_pagewise[page] = list(set(entities_pagewise[page]))
134+
entities_pagewise[page] = list(dict.fromkeys(entities_pagewise[page]).keys())
135135

136136
for page in entities_pagewise.keys():
137137
visualized_images[page] = _draw_bbox(
@@ -488,12 +488,12 @@ def __add__(self, list2):
488488
return EntityList([*self, *list2])
489489

490490
def get_text_and_words(self, config: TextLinearizationConfig = TextLinearizationConfig()):
491-
text, words = "", []
491+
texts, words = [], []
492492
for entity in self:
493493
entity_text, entity_words = entity.get_text_and_words(config)
494-
text += entity_text
494+
texts.append(entity_text)
495495
words.extend(entity_words)
496-
return text, words
496+
return config.layout_element_separator.join(texts), words
497497

498498
def _convert_form_to_list(
499499
form_objects,

0 commit comments

Comments
 (0)