Skip to content

Commit 6d136dc

Browse files
authored
Merge pull request alephdata#577 from alephdata/bugfix/573-respect-convert-timeout
Respect the timeout and retry settings for converting documents
2 parents 9d62633 + 7d068f8 commit 6d136dc

File tree

2 files changed

+19
-29
lines changed

2 files changed

+19
-29
lines changed

ingestors/settings.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@
44

55
TESTING = False
66

7-
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200) # 2 hrs
8-
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)
7+
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds
98

109
# Enable (expensive!) Google Cloud API
1110
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

ingestors/support/convert.py

Lines changed: 18 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,10 @@
99
from ingestors.support.cache import CacheSupport
1010
from ingestors.support.temp import TempFileSupport
1111
from ingestors.exc import ProcessingException
12+
from ingestors import settings
1213

1314
log = logging.getLogger(__name__)
1415

15-
TIMEOUT = 3600 # seconds
16-
CONVERT_RETRIES = 5
1716

1817
PDF_CACHE_ACCESSED = Counter(
1918
"ingestfile_pdf_cache_accessed",
@@ -45,7 +44,9 @@ def document_to_pdf(self, unique_tmpdir, file_path, entity):
4544
self.tags.set(key, content_hash)
4645
return pdf_file
4746

48-
def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
47+
def _document_to_pdf(
48+
self, unique_tmpdir, file_path, entity, timeout=settings.CONVERT_TIMEOUT
49+
):
4950
"""Converts an office document to PDF."""
5051
file_name = entity_filename(entity)
5152
log.info("Converting [%s] to PDF", entity)
@@ -72,30 +73,20 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
7273
file_path,
7374
]
7475
try:
75-
for attempt in range(1, CONVERT_RETRIES):
76-
log.info(
77-
f"Starting LibreOffice: %s with timeout %s attempt #{attempt}/{CONVERT_RETRIES}",
78-
cmd,
79-
timeout,
80-
)
81-
try:
82-
subprocess.run(cmd, timeout=timeout, check=True)
83-
except Exception as e:
84-
log.info(
85-
f"Could not be converted to PDF (attempt {attempt}/{CONVERT_RETRIES}): {e}"
86-
)
87-
continue
76+
log.info(f"Starting LibreOffice: {cmd} with timeout {timeout}")
77+
try:
78+
subprocess.run(cmd, timeout=timeout, check=True)
79+
except Exception as e:
80+
raise ProcessingException("Could not be converted to PDF") from e
8881

89-
for file_name in os.listdir(pdf_output_dir):
90-
if not file_name.endswith(".pdf"):
91-
continue
92-
out_file = os.path.join(pdf_output_dir, file_name)
93-
if os.stat(out_file).st_size == 0:
94-
continue
95-
log.info(f"Successfully converted {out_file}")
96-
return out_file
97-
raise ProcessingException(
98-
f"Could not be converted to PDF (attempt #{attempt}/{CONVERT_RETRIES})"
99-
)
82+
for file_name in os.listdir(pdf_output_dir):
83+
if not file_name.endswith(".pdf"):
84+
continue
85+
out_file = os.path.join(pdf_output_dir, file_name)
86+
if os.stat(out_file).st_size == 0:
87+
continue
88+
log.info(f"Successfully converted {out_file}")
89+
return out_file
90+
raise ProcessingException("Could not be converted to PDF")
10091
except Exception as e:
10192
raise ProcessingException("Could not be converted to PDF") from e

0 commit comments

Comments
 (0)