1- FROM ubuntu:20.04
1+ FROM python:3.9-bookworm
22ENV DEBIAN_FRONTEND noninteractive
33
44LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
55LABEL org.opencontainers.image.licenses MIT
66LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
77
88# Enable non-free archive for `unrar`.
9- # RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list
10- RUN apt-get -qq -y update \
11- && apt-get -qq -y install build-essential locales ca-certificates \
9+ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
10+ && apt-get -qq -y update \
11+ && apt-get -qq -y install build-essential locales \
1212 # python deps (mostly to install their dependencies)
13- python3-pip python3-dev python3-pil \
13+ python3-dev \
1414 # tesseract
15- tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
15+ tesseract-ocr libtesseract-dev libleptonica-dev \
1616 # libraries
17- libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
18- zlib1g-dev libicu-dev libxml2-dev \
17+ libldap2-dev libsasl2-dev \
1918 # package tools
2019 unrar p7zip-full \
2120 # audio & video metadata
2221 libmediainfo-dev \
2322 # image processing, djvu
24- imagemagick-common imagemagick mdbtools djvulibre-bin \
25- libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
23+ mdbtools djvulibre-bin \
24+ libtiff5-dev \
2625 libtiff-tools ghostscript librsvg2-bin jbig2dec \
27- pst-utils \
26+ pst-utils libgif-dev \
2827 # ## tesseract
2928 tesseract-ocr-eng \
3029 tesseract-ocr-swa \
@@ -98,7 +97,7 @@ RUN apt-get -qq -y update \
9897 tesseract-ocr-uzb \
9998 # ## pdf convert: libreoffice + a bunch of fonts
10099 libreoffice fonts-opensymbol hyphen-fr hyphen-de \
101- hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
100+ hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \
102101 fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
103102 fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
104103 fonts-tlwg-purisa \
@@ -121,11 +120,7 @@ RUN groupadd -g 1000 -r app \
121120RUN mkdir /models/ && \
122121 curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
123122
124- # Having updated pip/setuptools seems to break the test run for some reason (12/01/2022)
125- # RUN pip3 install --no-cache-dir -U pip setuptools
126123COPY requirements.txt /tmp/
127- RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
128- RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
129124RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
130125
131126# Install spaCy models
@@ -148,14 +143,15 @@ RUN python3 -m spacy download el_core_news_sm \
148143
149144COPY . /ingestors
150145WORKDIR /ingestors
151- RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
146+ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
152147RUN chown -R app:app /ingestors
153148
154149ENV ARCHIVE_TYPE=file \
155150 ARCHIVE_PATH=/data \
156151 FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
157152 REDIS_URL=redis://redis:6379/0 \
158- TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
153+ TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
154+ LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
159155
160156# USER app
161157CMD ingestors process
0 commit comments