Skip to content

Commit 94ac825

Browse files
committed
Add legacy check to parser
1 parent 1998338 commit 94ac825

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

textractor/parsers/response_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
LAYOUT_TABLE,
6767
LAYOUT_KEY_VALUE,
6868
)
69+
from textractor.utils.legacy_utils import converter
6970

7071
THRESHOLD = 0.95
7172

@@ -1542,7 +1543,6 @@ def parser_analyze_expense_response(response):
15421543
document.response = response
15431544
return document
15441545

1545-
15461546
def parse(response: dict) -> Document:
15471547
"""
15481548
Ingests response data and API Call Mode and calls the appropriate function for it.
@@ -1559,4 +1559,4 @@ def parse(response: dict) -> Document:
15591559
if "ExpenseDocuments" in response:
15601560
return parser_analyze_expense_response(response)
15611561
else:
1562-
return parse_document_api_response(response)
1562+
return parse_document_api_response(converter(response))

textractor/utils/legacy_utils.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from textractor.data.constants import (
2+
LAYOUT_FIGURE,
3+
LAYOUT_LIST,
4+
LAYOUT_TABLE,
5+
LAYOUT_KEY_VALUE,
6+
LAYOUT_TEXT,
7+
LAYOUT_TITLE,
8+
LAYOUT_HEADER,
9+
LAYOUT_FOOTER,
10+
LAYOUT_SECTION_HEADER,
11+
LAYOUT_PAGE_NUMBER,
12+
)
13+
14+
def converter(response):
    """Normalize layout blocks in a response dict before it is parsed.

    The response is modified in place and also returned. For every entry of
    ``response["Blocks"]``:

    * a block whose ``BlockType`` starts with ``"LAYOUT_FIGURE_"`` is
      relabelled as ``LAYOUT_TEXT``;
    * any other ``"LAYOUT_*"`` block whose type is not one of the layout
      constants imported by this module is relabelled as ``LAYOUT_FIGURE``;
    * a ``LAYOUT_FIGURE`` block that lists ``"CONTAINER"`` in its
      ``EntityTypes`` is removed from ``response["Blocks"]`` and its id is
      dropped from the ``CHILD`` relationships of the PAGE blocks.

    Responses with no PAGE block, with several PAGE blocks, or with PAGE
    blocks that carry no relationships are all handled without raising.

    :param response: Response dictionary containing a ``"Blocks"`` list.
    :type response: dict
    :return: The same ``response`` object, after in-place normalization.
    :rtype: dict
    :raises KeyError: If ``response`` has no ``"Blocks"`` key.
    """
    blocks = response["Blocks"]
    page_blocks = []
    removed_indices = set()
    removed_ids = set()

    for index, block in enumerate(blocks):
        # `or ""` also covers an explicit ``"BlockType": None`` entry.
        block_type = block.get("BlockType") or ""
        if block_type == "PAGE":
            # Keep every page: a removed container may belong to any of them.
            page_blocks.append(block)
        elif block_type.startswith("LAYOUT_FIGURE_"):
            block["BlockType"] = LAYOUT_TEXT
        elif block_type.startswith("LAYOUT_") and block_type not in (
            LAYOUT_TEXT,
            LAYOUT_TITLE,
            LAYOUT_HEADER,
            LAYOUT_FOOTER,
            LAYOUT_SECTION_HEADER,
            LAYOUT_PAGE_NUMBER,
            LAYOUT_LIST,
            LAYOUT_FIGURE,
            LAYOUT_TABLE,
            LAYOUT_KEY_VALUE,
        ):
            # Unrecognized layout type: fall back to a generic figure.
            block["BlockType"] = LAYOUT_FIGURE
        elif block_type == LAYOUT_FIGURE and "CONTAINER" in (
            block.get("EntityTypes") or []
        ):
            removed_indices.add(index)
            removed_ids.add(block.get("Id"))

    if not removed_indices:
        return response

    # Slice assignment keeps the original list object, so any caller holding
    # a reference to response["Blocks"] sees the removal as well.
    blocks[:] = [
        block for index, block in enumerate(blocks)
        if index not in removed_indices
    ]

    # Drop the removed ids from every page's CHILD list; ids that are absent
    # from a given page are simply ignored instead of raising ValueError.
    for page in page_blocks:
        for relationship in page.get("Relationships") or []:
            if relationship.get("Type") != "CHILD":
                continue
            child_ids = relationship.get("Ids")
            if child_ids:
                child_ids[:] = [
                    child_id for child_id in child_ids
                    if child_id not in removed_ids
                ]

    return response

0 commit comments

Comments
 (0)