Skip to content

Commit 2d06c1f

Browse files
authored
Fix .to_markdown() raising an exception on missing local config
2 parents 32b5d76 + b209eae commit 2d06c1f

File tree

1 file changed

+47
-47
lines changed

1 file changed

+47
-47
lines changed

textractor/entities/table.py

Lines changed: 47 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -674,23 +674,24 @@ def to_html(self) -> str:
674674
def get_text_and_words(
675675
self, config: TextLinearizationConfig = TextLinearizationConfig()
676676
):
677+
local_config = deepcopy(config)
677678
words_ = self.words
678679
# If no text, return empty string
679-
if not words_ and config.table_remove_column_headers:
680+
if not words_ and local_config.table_remove_column_headers:
680681
return "", []
681682

682683
# If not many words, only return text
683-
if len(words_) < config.table_min_table_words:
684+
if len(words_) < local_config.table_min_table_words:
684685
return linearize_children(words_, config=config)
685686

686-
words = [Word(str(uuid.uuid4()), self.bbox, config.table_prefix)] if config.table_prefix else []
687+
words = [Word(str(uuid.uuid4()), self.bbox, local_config.table_prefix)] if local_config.table_prefix else []
687688
rows = sorted([(key, list(group)) for key, group in itertools.groupby(
688689
self.table_cells, key=lambda cell: cell.row_index
689690
)], key=lambda r: r[0])
690691
processed_cells = set()
691692
# Fill the table
692693
row_offset = 0
693-
if config.table_flatten_headers:
694+
if local_config.table_flatten_headers:
694695
columns = [[] for _ in range(len(rows[0][1]))]
695696
columns_bbox = [[] for _ in range(len(rows[0][1]))]
696697
for _, row in rows:
@@ -700,8 +701,8 @@ def get_text_and_words(
700701
for i, cell in enumerate(row):
701702
if (
702703
cell not in processed_cells or
703-
config.table_duplicate_text_in_merged_cells or
704-
config.table_flatten_headers
704+
local_config.table_duplicate_text_in_merged_cells or
705+
local_config.table_flatten_headers
705706
):
706707
if cell.siblings:
707708
# This handles the edge case where we are flattening the headers
@@ -720,21 +721,21 @@ def get_text_and_words(
720721
_, words = cell.get_text_and_words(config)
721722
columns[i].extend(words)
722723
columns_bbox[i].append(cell.bbox)
723-
elif config.table_cell_empty_cell_placeholder:
724-
columns[i].append(Word(str(uuid.uuid4()), cell.bbox, config.table_cell_empty_cell_placeholder))
724+
elif local_config.table_cell_empty_cell_placeholder:
725+
columns[i].append(Word(str(uuid.uuid4()), cell.bbox, local_config.table_cell_empty_cell_placeholder))
725726
row_offset += 1
726727
if columns:
727728
columns_bbox = [BoundingBox.enclosing_bbox(cbb) for cbb in columns_bbox]
728-
if config.table_row_prefix and config.add_prefixes_and_suffixes_as_words:
729-
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(columns_bbox), config.table_row_prefix, is_structure=True))
729+
if local_config.table_row_prefix and local_config.add_prefixes_and_suffixes_as_words:
730+
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(columns_bbox), local_config.table_row_prefix, is_structure=True))
730731
for i, column in enumerate(columns):
731732
words.append(
732733
Word(
733734
str(uuid.uuid4()),
734735
columns_bbox[i],
735-
config.table_cell_header_prefix
736-
if config.table_cell_header_prefix
737-
else config.table_cell_prefix,
736+
local_config.table_cell_header_prefix
737+
if local_config.table_cell_header_prefix
738+
else local_config.table_cell_prefix,
738739
is_structure=True
739740
)
740741
)
@@ -743,17 +744,17 @@ def get_text_and_words(
743744
Word(
744745
str(uuid.uuid4()),
745746
columns_bbox[i],
746-
config.table_cell_header_suffix
747-
if config.table_cell_header_suffix
748-
else config.table_cell_suffix,
747+
local_config.table_cell_header_suffix
748+
if local_config.table_cell_header_suffix
749+
else local_config.table_cell_suffix,
749750
is_structure=True
750751
)
751752
)
752-
if config.table_row_suffix and config.add_prefixes_and_suffixes_as_words:
753-
words.append(Word(str(uuid.uuid4()), columns_bbox, config.table_row_suffix, is_structure=True))
753+
if local_config.table_row_suffix and local_config.add_prefixes_and_suffixes_as_words:
754+
words.append(Word(str(uuid.uuid4()), columns_bbox, local_config.table_row_suffix, is_structure=True))
754755
for _, cells in rows[row_offset:]:
755-
if config.table_row_prefix and config.add_prefixes_and_suffixes_as_words:
756-
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), config.table_row_prefix, is_structure=True))
756+
if local_config.table_row_prefix and local_config.add_prefixes_and_suffixes_as_words:
757+
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), local_config.table_row_prefix, is_structure=True))
757758
for cell in sorted(cells, key=lambda c: c.col_index):
758759
# Siblings includes the current cell
759760
if cell.siblings:
@@ -765,35 +766,35 @@ def get_text_and_words(
765766
row_index = first_row
766767
row_span = last_row - first_row + 1
767768
children = []
768-
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
769+
if (cell.col_index == first_col and cell.row_index == first_row) or local_config.table_duplicate_text_in_merged_cells:
769770
for sib in cell.siblings:
770771
children.extend(sib.children)
771772
processed_cells.add(sib)
772773
_, cell_words = linearize_children(children, config=config, no_new_lines=True)
773-
elif cell.row_index == first_row and config.table_cell_left_merge_cell_placeholder:
774+
elif cell.row_index == first_row and local_config.table_cell_left_merge_cell_placeholder:
774775
# Left-merge token
775776
cell_words = [
776777
Word(str(uuid.uuid4()),
777778
cell_bbox,
778-
config.table_cell_left_merge_cell_placeholder,
779+
local_config.table_cell_left_merge_cell_placeholder,
779780
is_structure=True
780781
)
781782
]
782-
elif cell.col_index == first_col and config.table_cell_top_merge_cell_placeholder:
783+
elif cell.col_index == first_col and local_config.table_cell_top_merge_cell_placeholder:
783784
# Top-merge token
784785
cell_words = [
785786
Word(str(uuid.uuid4()),
786787
cell_bbox,
787-
config.table_cell_top_merge_cell_placeholder,
788+
local_config.table_cell_top_merge_cell_placeholder,
788789
is_structure=True
789790
)
790791
]
791-
elif cell.col_index != first_col and cell.row_index != first_row and config.table_cell_cross_merge_cell_placeholder:
792+
elif cell.col_index != first_col and cell.row_index != first_row and local_config.table_cell_cross_merge_cell_placeholder:
792793
# Cross-merge token (left and top)
793794
cell_words = [
794795
Word(str(uuid.uuid4()),
795796
cell_bbox,
796-
config.table_cell_cross_merge_cell_placeholder,
797+
local_config.table_cell_cross_merge_cell_placeholder,
797798
is_structure=True
798799
)
799800
]
@@ -807,15 +808,15 @@ def get_text_and_words(
807808
row_index = cell.row_index
808809
row_span = cell.row_span
809810
_, cell_words = cell.get_text_and_words(config)
810-
if config.add_prefixes_and_suffixes_as_words:
811-
if config.table_cell_prefix or (config.table_cell_header_prefix and cell.is_column_header):
811+
if local_config.add_prefixes_and_suffixes_as_words:
812+
if local_config.table_cell_prefix or (local_config.table_cell_header_prefix and cell.is_column_header):
812813
words.append(
813814
Word(
814815
str(uuid.uuid4()),
815816
cell_bbox,
816-
config.table_cell_header_prefix
817-
if cell.is_column_header and config.table_cell_header_prefix
818-
else config.table_cell_prefix,
817+
local_config.table_cell_header_prefix
818+
if cell.is_column_header and local_config.table_cell_header_prefix
819+
else local_config.table_cell_prefix,
819820
is_structure=True
820821
)
821822
)
@@ -827,15 +828,15 @@ def get_text_and_words(
827828
words[-1].row_span = row_span
828829

829830
words.extend(cell_words)
830-
if not cell_words and config.table_cell_empty_cell_placeholder:
831-
words.append(Word(str(uuid.uuid4()), cell_bbox, config.table_cell_empty_cell_placeholder))
831+
if not cell_words and local_config.table_cell_empty_cell_placeholder:
832+
words.append(Word(str(uuid.uuid4()), cell_bbox, local_config.table_cell_empty_cell_placeholder))
832833

833-
if config.table_cell_suffix or (config.table_cell_header_suffix and cell.is_column_header):
834+
if local_config.table_cell_suffix or (local_config.table_cell_header_suffix and cell.is_column_header):
834835
words.append(
835836
Word(
836837
str(uuid.uuid4()),
837838
cell_bbox,
838-
config.table_cell_header_suffix if cell.is_column_header and config.table_cell_header_suffix else config.table_cell_suffix,
839+
local_config.table_cell_header_suffix if cell.is_column_header and local_config.table_cell_header_suffix else local_config.table_cell_suffix,
839840
is_structure=True
840841
)
841842
)
@@ -847,38 +848,37 @@ def get_text_and_words(
847848
words[-1].row_span = row_span
848849
else:
849850
words.extend(cell_words)
850-
if config.table_row_suffix and config.add_prefixes_and_suffixes_as_words:
851-
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), config.table_row_suffix, is_structure=True))
851+
if local_config.table_row_suffix and local_config.add_prefixes_and_suffixes_as_words:
852+
words.append(Word(str(uuid.uuid4()), BoundingBox.enclosing_bbox(cells), local_config.table_row_suffix, is_structure=True))
852853

853-
if config.table_suffix:
854-
words.append(Word(str(uuid.uuid4()), self.bbox, config.table_suffix))
854+
if local_config.table_suffix:
855+
words.append(Word(str(uuid.uuid4()), self.bbox, local_config.table_suffix))
855856

856857
for w in words:
857858
w.table_id = str(self.id)
858859
w.table_bbox = self.bbox
859860

860-
text = (config.table_prefix if config.add_prefixes_and_suffixes_in_text else "")
861+
text = (local_config.table_prefix if local_config.add_prefixes_and_suffixes_in_text else "")
861862
# Markdown
862-
if config.table_linearization_format == "markdown":
863+
if local_config.table_linearization_format == "markdown":
863864
df = self.to_pandas(
864865
use_columns=True,
865866
config=config
866867
)
867868
has_column = any([isinstance(c, str) for c in df.columns])
868-
if config.table_remove_column_headers:
869+
if local_config.table_remove_column_headers:
869870
headers = df.columns if has_column else ["" for c in df.columns]
870871
else:
871872
headers = df.columns
872873
table = df.to_markdown(
873-
tablefmt=config.table_tabulate_format, headers=headers, index=False
874+
tablefmt=local_config.table_tabulate_format, headers=headers, index=False
874875
)
875-
if config.table_tabulate_remove_extra_hyphens:
876+
if local_config.table_tabulate_remove_extra_hyphens:
876877
while "-" * 2 in table:
877878
table = table.replace("--", "-")
878879
text += table
879880
# Plaintext or HTML
880881
else:
881-
local_config = deepcopy(config)
882882
# FIXME: The cyclomatic complexity of doing things like this will be unsustainable.
883883
if local_config.table_flatten_semi_structured_as_plaintext and self.table_type == TableTypes.SEMI_STRUCTURED:
884884
text = "<p>"
@@ -1030,7 +1030,7 @@ def get_text_and_words(
10301030
text += (local_config.table_row_suffix if local_config.add_prefixes_and_suffixes_in_text else "")
10311031
text += local_config.table_row_separator
10321032

1033-
if local_config.table_add_title_as_caption and self.title:
1033+
if local_config.table_add_title_as_caption and self.title and local_config.table_linearization_format == "html":
10341034
text += "<caption>" + self.title.get_text() + "</caption>"
10351035

10361036
text += (local_config.table_suffix if local_config.add_prefixes_and_suffixes_in_text else "")

0 commit comments

Comments
 (0)