Skip to content

Commit 60fb427

Browse files
authored
Fix missing figures
2 parents 83d476c + 5dc1b1a commit 60fb427

File tree

1 file changed: 4 additions and 3 deletions.

1 file changed

+4
-3
lines changed

textractor/parsers/response_parser.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from textractor.entities.layout import Layout
3939
from textractor.data.constants import (
4040
LAYOUT_ENTITY,
41+
LAYOUT_FIGURE,
4142
TABLE_FOOTER,
4243
TABLE_TITLE,
4344
COLUMN_HEADER,
@@ -1282,7 +1283,7 @@ def parse_document_api_response(response: dict) -> Document:
12821283
key_values.remove(kv)
12831284

12841285

1285-
page.leaf_layouts = [l for l in page.leaf_layouts if l.children]
1286+
page.leaf_layouts = [l for l in page.leaf_layouts if l.children or l.layout_type == LAYOUT_FIGURE]
12861287

12871288
# We create layout elements for the KeyValues that did not match to a layout element in the
12881289
# previous step
@@ -1334,7 +1335,7 @@ def parse_document_api_response(response: dict) -> Document:
13341335
words_in_sub_layouts.add(w)
13351336
for word in words_in_sub_layouts:
13361337
layout.remove(word)
1337-
if not layout.children:
1338+
if not layout.children and layout.layout_type != LAYOUT_FIGURE:
13381339
layouts_to_remove.append(layout)
13391340

13401341
# Clean up layouts that became empty due to the previous step.
@@ -1380,7 +1381,7 @@ def parse_document_api_response(response: dict) -> Document:
13801381
word_set = set()
13811382
for layout in sorted(page.layouts, key=lambda l: l.reading_order):
13821383
layout.visit(word_set)
1383-
if not layout.children:
1384+
if not layout.children and layout.layout_type != LAYOUT_FIGURE:
13841385
try:
13851386
page.leaf_layouts.remove(layout)
13861387
except:

0 commit comments

Comments (0)