Skip to content

Commit dfa21f2

Browse files
committed
Shuffle-AI mods
1 parent e96348c commit dfa21f2

File tree

2 files changed

+20
-3
lines changed

2 files changed

+20
-3
lines changed

shuffle-ai/1.0.0/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ FROM frikky/shuffle:app_sdk as base
55
FROM base as builder
66

77
# Install all alpine build tools needed for our pip installs
8-
RUN apk --no-cache add --update alpine-sdk libffi libffi-dev musl-dev openssl-dev git
8+
RUN apk --no-cache add --update alpine-sdk libffi libffi-dev musl-dev openssl-dev git poppler-utils
99

1010
# Install all of our pip packages in a single directory that we can copy to our base image later
1111
RUN mkdir /install

shuffle-ai/1.0.0/src/app.py

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,11 @@ def export_text_to_json(image_text, extracted_text):
135135
"reason": "Something failed in reading and parsing the pdf. See error logs for more info",
136136
}
137137

138+
# Check type of pdf_data["data"]
139+
if not isinstance(pdf_data["data"], bytes):
140+
self.logger.info("Encoding data to bytes for the bytestream reader")
141+
pdf_data["data"] = pdf_data["data"].encode()
142+
138143
# Make a tempfile for the file data from self.get_file
139144
# Make a tempfile with tempfile library
140145
with tempfile.NamedTemporaryFile() as temp:
@@ -162,12 +167,24 @@ def export_text_to_json(image_text, extracted_text):
162167

163168
def extract_text_from_image(self, file_id):
164169
# Check if it's a pdf
170+
171+
pdf_data = self.get_file(file_id)
172+
if "filename" not in pdf_data:
173+
available_fields = []
174+
for key, value in pdf_data.items():
175+
available_fields.append(key)
176+
177+
return {
178+
"success": False,
179+
"reason": "File not found",
180+
"details": f"Available fields: {available_fields}",
181+
}
182+
165183
# If it is, use extract_text_from_pdf
166184
# If it's not, use pytesseract
167-
if self.get_file(file_id)["name"].endswith(".pdf"):
185+
if pdf_data["filename"].endswith(".pdf"):
168186
return self.extract_text_from_pdf(file_id)
169187

170-
pdf_data = self.get_file(file_id)
171188
defaultdata = {
172189
"success": False,
173190
"file_id": file_id,

0 commit comments

Comments
 (0)