@@ -135,6 +135,11 @@ def export_text_to_json(image_text, extracted_text):
135135 "reason" : "Something failed in reading and parsing the pdf. See error logs for more info" ,
136136 }
137137
138+ # Check type of pdf_data["data"]
139+ if not isinstance (pdf_data ["data" ], bytes ):
140+ self .logger .info ("Encoding data to bytes for the bytestream reader" )
141+ pdf_data ["data" ] = pdf_data ["data" ].encode ()
142+
138143 # Make a tempfile for the file data from self.get_file
139144 # Make a tempfile with tempfile library
140145 with tempfile .NamedTemporaryFile () as temp :
@@ -162,12 +167,24 @@ def export_text_to_json(image_text, extracted_text):
162167
163168 def extract_text_from_image (self , file_id ):
164169 # Check if it's a pdf
170+
171+ pdf_data = self .get_file (file_id )
172+ if "filename" not in pdf_data :
173+ available_fields = []
174+ for key , value in pdf_data .items ():
175+ available_fields .append (key )
176+
177+ return {
178+ "success" : False ,
179+ "reason" : "File not found" ,
180+ "details" : f"Available fields: { available_fields } " ,
181+ }
182+
165183 # If it is, use extract_text_from_pdf
166184 # If it's not, use pytesseract
167- if self . get_file ( file_id )[ "name " ].endswith (".pdf" ):
185+ if pdf_data [ "filename " ].endswith (".pdf" ):
168186 return self .extract_text_from_pdf (file_id )
169187
170- pdf_data = self .get_file (file_id )
171188 defaultdata = {
172189 "success" : False ,
173190 "file_id" : file_id ,
0 commit comments