Skip to content

Commit f5cf4dd

Browse files
committed
Respect the timeout and retry settings for converting docs
Should fix #573
1 parent 9d62633 commit f5cf4dd

File tree

2 files changed

+10
-11
lines changed

2 files changed

+10
-11
lines changed

ingestors/settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
TESTING = False
66

7-
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200) # 2 hrs
8-
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)
7+
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds
8+
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 3)
99

1010
# Enable (expensive!) Google Cloud API
1111
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

ingestors/support/convert.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,10 @@
99
from ingestors.support.cache import CacheSupport
1010
from ingestors.support.temp import TempFileSupport
1111
from ingestors.exc import ProcessingException
12+
from ingestors import settings
1213

1314
log = logging.getLogger(__name__)
1415

15-
TIMEOUT = 3600 # seconds
16-
CONVERT_RETRIES = 5
1716

1817
PDF_CACHE_ACCESSED = Counter(
1918
"ingestfile_pdf_cache_accessed",
@@ -45,7 +44,9 @@ def document_to_pdf(self, unique_tmpdir, file_path, entity):
4544
self.tags.set(key, content_hash)
4645
return pdf_file
4746

48-
def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
47+
def _document_to_pdf(
48+
self, unique_tmpdir, file_path, entity, timeout=settings.CONVERT_TIMEOUT
49+
):
4950
"""Converts an office document to PDF."""
5051
file_name = entity_filename(entity)
5152
log.info("Converting [%s] to PDF", entity)
@@ -72,17 +73,15 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
7273
file_path,
7374
]
7475
try:
75-
for attempt in range(1, CONVERT_RETRIES):
76+
for attempt in range(1, settings.CONVERT_RETRIES):
7677
log.info(
77-
f"Starting LibreOffice: %s with timeout %s attempt #{attempt}/{CONVERT_RETRIES}",
78-
cmd,
79-
timeout,
78+
f"Starting LibreOffice: {cmd} with timeout {timeout} attempt #{attempt}/{settings.CONVERT_RETRIES}",
8079
)
8180
try:
8281
subprocess.run(cmd, timeout=timeout, check=True)
8382
except Exception as e:
8483
log.info(
85-
f"Could not be converted to PDF (attempt {attempt}/{CONVERT_RETRIES}): {e}"
84+
f"Could not be converted to PDF (attempt {attempt}/{settings.CONVERT_RETRIES}): {e}"
8685
)
8786
continue
8887

@@ -95,7 +94,7 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
9594
log.info(f"Successfully converted {out_file}")
9695
return out_file
9796
raise ProcessingException(
98-
f"Could not be converted to PDF (attempt #{attempt}/{CONVERT_RETRIES})"
97+
f"Could not be converted to PDF (attempt #{attempt}/{settings.CONVERT_RETRIES})"
9998
)
10099
except Exception as e:
101100
raise ProcessingException("Could not be converted to PDF") from e

0 commit comments

Comments
 (0)