@@ -674,23 +674,24 @@ def to_html(self) -> str:
674
674
def get_text_and_words (
675
675
self , config : TextLinearizationConfig = TextLinearizationConfig ()
676
676
):
677
+ local_config = deepcopy (config )
677
678
words_ = self .words
678
679
# If no text, return empty string
679
- if not words_ and config .table_remove_column_headers :
680
+ if not words_ and local_config .table_remove_column_headers :
680
681
return "" , []
681
682
682
683
# If not many words, only return text
683
- if len (words_ ) < config .table_min_table_words :
684
+ if len (words_ ) < local_config .table_min_table_words :
684
685
return linearize_children (words_ , config = config )
685
686
686
- words = [Word (str (uuid .uuid4 ()), self .bbox , config .table_prefix )] if config .table_prefix else []
687
+ words = [Word (str (uuid .uuid4 ()), self .bbox , local_config .table_prefix )] if local_config .table_prefix else []
687
688
rows = sorted ([(key , list (group )) for key , group in itertools .groupby (
688
689
self .table_cells , key = lambda cell : cell .row_index
689
690
)], key = lambda r : r [0 ])
690
691
processed_cells = set ()
691
692
# Fill the table
692
693
row_offset = 0
693
- if config .table_flatten_headers :
694
+ if local_config .table_flatten_headers :
694
695
columns = [[] for _ in range (len (rows [0 ][1 ]))]
695
696
columns_bbox = [[] for _ in range (len (rows [0 ][1 ]))]
696
697
for _ , row in rows :
@@ -700,8 +701,8 @@ def get_text_and_words(
700
701
for i , cell in enumerate (row ):
701
702
if (
702
703
cell not in processed_cells or
703
- config .table_duplicate_text_in_merged_cells or
704
- config .table_flatten_headers
704
+ local_config .table_duplicate_text_in_merged_cells or
705
+ local_config .table_flatten_headers
705
706
):
706
707
if cell .siblings :
707
708
# This handles the edge case where we are flattening the headers
@@ -720,21 +721,21 @@ def get_text_and_words(
720
721
_ , words = cell .get_text_and_words (config )
721
722
columns [i ].extend (words )
722
723
columns_bbox [i ].append (cell .bbox )
723
- elif config .table_cell_empty_cell_placeholder :
724
- columns [i ].append (Word (str (uuid .uuid4 ()), cell .bbox , config .table_cell_empty_cell_placeholder ))
724
+ elif local_config .table_cell_empty_cell_placeholder :
725
+ columns [i ].append (Word (str (uuid .uuid4 ()), cell .bbox , local_config .table_cell_empty_cell_placeholder ))
725
726
row_offset += 1
726
727
if columns :
727
728
columns_bbox = [BoundingBox .enclosing_bbox (cbb ) for cbb in columns_bbox ]
728
- if config .table_row_prefix and config .add_prefixes_and_suffixes_as_words :
729
- words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (columns_bbox ), config .table_row_prefix , is_structure = True ))
729
+ if local_config .table_row_prefix and local_config .add_prefixes_and_suffixes_as_words :
730
+ words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (columns_bbox ), local_config .table_row_prefix , is_structure = True ))
730
731
for i , column in enumerate (columns ):
731
732
words .append (
732
733
Word (
733
734
str (uuid .uuid4 ()),
734
735
columns_bbox [i ],
735
- config .table_cell_header_prefix
736
- if config .table_cell_header_prefix
737
- else config .table_cell_prefix ,
736
+ local_config .table_cell_header_prefix
737
+ if local_config .table_cell_header_prefix
738
+ else local_config .table_cell_prefix ,
738
739
is_structure = True
739
740
)
740
741
)
@@ -743,17 +744,17 @@ def get_text_and_words(
743
744
Word (
744
745
str (uuid .uuid4 ()),
745
746
columns_bbox [i ],
746
- config .table_cell_header_suffix
747
- if config .table_cell_header_suffix
748
- else config .table_cell_suffix ,
747
+ local_config .table_cell_header_suffix
748
+ if local_config .table_cell_header_suffix
749
+ else local_config .table_cell_suffix ,
749
750
is_structure = True
750
751
)
751
752
)
752
- if config .table_row_suffix and config .add_prefixes_and_suffixes_as_words :
753
- words .append (Word (str (uuid .uuid4 ()), columns_bbox , config .table_row_suffix , is_structure = True ))
753
+ if local_config .table_row_suffix and local_config .add_prefixes_and_suffixes_as_words :
754
+ words .append (Word (str (uuid .uuid4 ()), columns_bbox , local_config .table_row_suffix , is_structure = True ))
754
755
for _ , cells in rows [row_offset :]:
755
- if config .table_row_prefix and config .add_prefixes_and_suffixes_as_words :
756
- words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (cells ), config .table_row_prefix , is_structure = True ))
756
+ if local_config .table_row_prefix and local_config .add_prefixes_and_suffixes_as_words :
757
+ words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (cells ), local_config .table_row_prefix , is_structure = True ))
757
758
for cell in sorted (cells , key = lambda c : c .col_index ):
758
759
# Siblings includes the current cell
759
760
if cell .siblings :
@@ -765,35 +766,35 @@ def get_text_and_words(
765
766
row_index = first_row
766
767
row_span = last_row - first_row + 1
767
768
children = []
768
- if (cell .col_index == first_col and cell .row_index == first_row ) or config .table_duplicate_text_in_merged_cells :
769
+ if (cell .col_index == first_col and cell .row_index == first_row ) or local_config .table_duplicate_text_in_merged_cells :
769
770
for sib in cell .siblings :
770
771
children .extend (sib .children )
771
772
processed_cells .add (sib )
772
773
_ , cell_words = linearize_children (children , config = config , no_new_lines = True )
773
- elif cell .row_index == first_row and config .table_cell_left_merge_cell_placeholder :
774
+ elif cell .row_index == first_row and local_config .table_cell_left_merge_cell_placeholder :
774
775
# Left-merge token
775
776
cell_words = [
776
777
Word (str (uuid .uuid4 ()),
777
778
cell_bbox ,
778
- config .table_cell_left_merge_cell_placeholder ,
779
+ local_config .table_cell_left_merge_cell_placeholder ,
779
780
is_structure = True
780
781
)
781
782
]
782
- elif cell .col_index == first_col and config .table_cell_top_merge_cell_placeholder :
783
+ elif cell .col_index == first_col and local_config .table_cell_top_merge_cell_placeholder :
783
784
# Top-merge token
784
785
cell_words = [
785
786
Word (str (uuid .uuid4 ()),
786
787
cell_bbox ,
787
- config .table_cell_top_merge_cell_placeholder ,
788
+ local_config .table_cell_top_merge_cell_placeholder ,
788
789
is_structure = True
789
790
)
790
791
]
791
- elif cell .col_index != first_col and cell .row_index != first_row and config .table_cell_cross_merge_cell_placeholder :
792
+ elif cell .col_index != first_col and cell .row_index != first_row and local_config .table_cell_cross_merge_cell_placeholder :
792
793
# Cross-merge token (left and top)
793
794
cell_words = [
794
795
Word (str (uuid .uuid4 ()),
795
796
cell_bbox ,
796
- config .table_cell_cross_merge_cell_placeholder ,
797
+ local_config .table_cell_cross_merge_cell_placeholder ,
797
798
is_structure = True
798
799
)
799
800
]
@@ -807,15 +808,15 @@ def get_text_and_words(
807
808
row_index = cell .row_index
808
809
row_span = cell .row_span
809
810
_ , cell_words = cell .get_text_and_words (config )
810
- if config .add_prefixes_and_suffixes_as_words :
811
- if config .table_cell_prefix or (config .table_cell_header_prefix and cell .is_column_header ):
811
+ if local_config .add_prefixes_and_suffixes_as_words :
812
+ if local_config .table_cell_prefix or (local_config .table_cell_header_prefix and cell .is_column_header ):
812
813
words .append (
813
814
Word (
814
815
str (uuid .uuid4 ()),
815
816
cell_bbox ,
816
- config .table_cell_header_prefix
817
- if cell .is_column_header and config .table_cell_header_prefix
818
- else config .table_cell_prefix ,
817
+ local_config .table_cell_header_prefix
818
+ if cell .is_column_header and local_config .table_cell_header_prefix
819
+ else local_config .table_cell_prefix ,
819
820
is_structure = True
820
821
)
821
822
)
@@ -827,15 +828,15 @@ def get_text_and_words(
827
828
words [- 1 ].row_span = row_span
828
829
829
830
words .extend (cell_words )
830
- if not cell_words and config .table_cell_empty_cell_placeholder :
831
- words .append (Word (str (uuid .uuid4 ()), cell_bbox , config .table_cell_empty_cell_placeholder ))
831
+ if not cell_words and local_config .table_cell_empty_cell_placeholder :
832
+ words .append (Word (str (uuid .uuid4 ()), cell_bbox , local_config .table_cell_empty_cell_placeholder ))
832
833
833
- if config .table_cell_suffix or (config .table_cell_header_suffix and cell .is_column_header ):
834
+ if local_config .table_cell_suffix or (local_config .table_cell_header_suffix and cell .is_column_header ):
834
835
words .append (
835
836
Word (
836
837
str (uuid .uuid4 ()),
837
838
cell_bbox ,
838
- config .table_cell_header_suffix if cell .is_column_header and config .table_cell_header_suffix else config .table_cell_suffix ,
839
+ local_config .table_cell_header_suffix if cell .is_column_header and local_config .table_cell_header_suffix else local_config .table_cell_suffix ,
839
840
is_structure = True
840
841
)
841
842
)
@@ -847,38 +848,37 @@ def get_text_and_words(
847
848
words [- 1 ].row_span = row_span
848
849
else :
849
850
words .extend (cell_words )
850
- if config .table_row_suffix and config .add_prefixes_and_suffixes_as_words :
851
- words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (cells ), config .table_row_suffix , is_structure = True ))
851
+ if local_config .table_row_suffix and local_config .add_prefixes_and_suffixes_as_words :
852
+ words .append (Word (str (uuid .uuid4 ()), BoundingBox .enclosing_bbox (cells ), local_config .table_row_suffix , is_structure = True ))
852
853
853
- if config .table_suffix :
854
- words .append (Word (str (uuid .uuid4 ()), self .bbox , config .table_suffix ))
854
+ if local_config .table_suffix :
855
+ words .append (Word (str (uuid .uuid4 ()), self .bbox , local_config .table_suffix ))
855
856
856
857
for w in words :
857
858
w .table_id = str (self .id )
858
859
w .table_bbox = self .bbox
859
860
860
- text = (config .table_prefix if config .add_prefixes_and_suffixes_in_text else "" )
861
+ text = (local_config .table_prefix if local_config .add_prefixes_and_suffixes_in_text else "" )
861
862
# Markdown
862
- if config .table_linearization_format == "markdown" :
863
+ if local_config .table_linearization_format == "markdown" :
863
864
df = self .to_pandas (
864
865
use_columns = True ,
865
866
config = config
866
867
)
867
868
has_column = any ([isinstance (c , str ) for c in df .columns ])
868
- if config .table_remove_column_headers :
869
+ if local_config .table_remove_column_headers :
869
870
headers = df .columns if has_column else ["" for c in df .columns ]
870
871
else :
871
872
headers = df .columns
872
873
table = df .to_markdown (
873
- tablefmt = config .table_tabulate_format , headers = headers , index = False
874
+ tablefmt = local_config .table_tabulate_format , headers = headers , index = False
874
875
)
875
- if config .table_tabulate_remove_extra_hyphens :
876
+ if local_config .table_tabulate_remove_extra_hyphens :
876
877
while "-" * 2 in table :
877
878
table = table .replace ("--" , "-" )
878
879
text += table
879
880
# Plaintext or HTML
880
881
else :
881
- local_config = deepcopy (config )
882
882
# FIXME: The cyclomatic complexity of doing things like this will be unsustainable.
883
883
if local_config .table_flatten_semi_structured_as_plaintext and self .table_type == TableTypes .SEMI_STRUCTURED :
884
884
text = "<p>"
@@ -1030,7 +1030,7 @@ def get_text_and_words(
1030
1030
text += (local_config .table_row_suffix if local_config .add_prefixes_and_suffixes_in_text else "" )
1031
1031
text += local_config .table_row_separator
1032
1032
1033
- if local_config .table_add_title_as_caption and self .title :
1033
+ if local_config .table_add_title_as_caption and self .title and local_config . table_linearization_format == "html" :
1034
1034
text += "<caption>" + self .title .get_text () + "</caption>"
1035
1035
1036
1036
text += (local_config .table_suffix if local_config .add_prefixes_and_suffixes_in_text else "" )
0 commit comments