Skip to content

Commit e734b90

Browse files
committed
Refactor ingest-file to be compatible with nomenklatura
1 parent a8c23fb commit e734b90

File tree

6 files changed

+125
-10
lines changed

6 files changed

+125
-10
lines changed

Dockerfile

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM python:3.9-bookworm
1+
FROM python:3.11-slim
22
# key=value form: the whitespace-separated ENV syntax is legacy and is flagged by BuildKit build checks.
ENV DEBIAN_FRONTEND=noninteractive
33

44
# OCI image title; written as key="value" rather than the legacy whitespace-separated form.
LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
@@ -10,7 +10,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
1010
&& apt-get -qq -y update \
1111
&& apt-get -qq -y install build-essential locales \
1212
# python deps (mostly to install their dependencies)
13-
python3-dev \
13+
git python3-dev \
14+
pkg-config libicu-dev \
1415
# tesseract
1516
tesseract-ocr libtesseract-dev libleptonica-dev \
1617
# libraries
@@ -24,6 +25,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
2425
libtiff5-dev \
2526
libtiff-tools ghostscript librsvg2-bin jbig2dec \
2627
pst-utils libgif-dev \
28+
# necessary for python-magic
29+
libmagic1 \
2730
### tesseract
2831
tesseract-ocr-eng \
2932
tesseract-ocr-swa \
@@ -121,6 +124,8 @@ RUN mkdir /models/ && \
121124
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
122125

123126
COPY requirements.txt /tmp/
127+
RUN pip3 install --no-cache-dir -q -U pip setuptools
128+
# Build PyICU from source against the system ICU headers (libicu-dev / pkg-config).
# --no-binary takes plain package names; only the special values :all: and :none: use colons.
# The version matches the pyicu pin in requirements.txt so the following install does not rebuild it.
RUN pip3 install --no-cache-dir --no-binary=pyicu pyicu==2.12
124129
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
125130

126131
# Install spaCy models
@@ -143,7 +148,7 @@ RUN python3 -m spacy download el_core_news_sm \
143148

144149
COPY . /ingestors
145150
WORKDIR /ingestors
146-
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
151+
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
147152
RUN chown -R app:app /ingestors
148153

149154
ENV ARCHIVE_TYPE=file \

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ services:
1616
ingest-file:
1717
build:
1818
context: .
19-
image: ghcr.io/alephdata/ingest-file
19+
# image: ghcr.io/alephdata/ingest-file
2020
hostname: ingest
2121
tmpfs:
2222
- /tmp:mode=777

ingestors/analysis/extract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from functools import lru_cache
44
from normality import collapse_spaces
55
from languagecodes import list_to_alpha3
6-
from fingerprints import clean_entity_name
6+
from fingerprints import clean_entity_prefix
77
from followthemoney.types import registry
88

99
from ingestors import settings
@@ -27,7 +27,7 @@
2727
def clean_name(text):
2828
if text is None or len(text) > NAME_MAX_LENGTH:
2929
return
30-
text = clean_entity_name(text)
30+
text = clean_entity_prefix(text)
3131
text = collapse_spaces(text)
3232
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
3333
return

ingestors/manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
193193
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
194194

195195
entity.set("processingStatus", self.STATUS_FAILURE)
196-
entity.set("processingAgent", get_distribution("ingest").version)
196+
entity.set("processingAgent", get_distribution("ingestors").version)
197197
entity.set("processedAt", now_string)
198198

199199
ingestor_class = None

pyproject.toml

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
[project]
2+
name = "ingestors"
3+
version = "3.22.0"
4+
description = "Ingestors extract the contents of mixed unstructured documents into structured (followthemoney) data. "
5+
authors = [
6+
{ name = "Friedrich Lindenberg", email = "friedrich@pudo.org" },
7+
{ name = "OCCRP Data Team", email = "data@occrp.org" },
8+
{ name = "ID.IO", email = "hi@investigativedata.org" },
9+
]
10+
readme = "README.md"
11+
license = "AGPL-3.0"
12+
requires-python = ">=3.11,<4.0"
13+
dependencies = [
14+
"banal (==1.0.6)",
15+
"normality (==2.5.0)",
16+
"pantomime (==0.6.1)",
17+
"followthemoney (==3.5.9)",
18+
"followthemoney-store[postgresql] (>=3.1.0,<3.2.0)",
19+
"servicelayer @ git+https://github.com/investigativedata/servicelayer.git@main",
20+
"languagecodes (==1.1.1)",
21+
"countrytagger (==0.1.2)",
22+
"pyicu (==2.12)",
23+
"google-cloud-vision (==3.7.2)",
24+
"tesserocr (==2.7.1)",
25+
"spacy (==3.6.1)",
26+
"numpy (<2.0)",
27+
"fingerprints (==1.2.3)",
28+
"fasttext (==0.9.2)",
29+
"pika (==1.3.2)",
30+
"nomenklatura (==3.15.2)",
31+
"dbf (==0.99.9)",
32+
"pymediainfo (==6.1.0)",
33+
"python-magic (==0.4.27)",
34+
"rarfile (==4.2)",
35+
"xlrd (==2.0.1)",
36+
"openpyxl (==3.1.2)",
37+
"odfpy (==1.4.1)",
38+
"faust-cchardet (==2.1.19)",
39+
"lxml (==5.0.0)",
40+
"olefile (==0.47)",
41+
"Pillow (==10.1.0)",
42+
"vobject (==0.9.6.1)",
43+
"msglite (==0.30.0)",
44+
"icalendar (==5.0.12)",
45+
"cryptography (==41.0.7)",
46+
"requests[security] (==2.31.0)",
47+
"pymupdf (==1.21.1)",
48+
"prometheus-client (==0.17.1)",
49+
"sentry_sdk (==2.0.1)",
50+
# servicelayer extras requirements
51+
"boto3 (>=1.11.9,<2.0.0)",
52+
"grpcio (>=1.32.0,<2.0.0)",
53+
"google-cloud-storage (>=1.31.0,<3.0.0)"
54+
]
55+
56+
[project.scripts]
57+
ingestors = "ingestors.cli:cli"
58+
59+
# No [project.gui-scripts] table: "ingestors" is a console CLI and is already
# registered under [project.scripts]. A GUI launcher with the same name would
# collide with the console entry point.
61+
62+
[project.entry-points."ingestors"]
63+
ignore = "ingestors.ignore:IgnoreIngestor"
64+
html = "ingestors.documents.html:HTMLIngestor"
65+
xml = "ingestors.documents.xml:XMLIngestor"
66+
plain = "ingestors.documents.plain:PlainTextIngestor"
67+
office = "ingestors.documents.office:DocumentIngestor"
68+
opendoc = "ingestors.documents.opendoc:OpenDocumentIngestor"
69+
ooxml = "ingestors.documents.ooxml:OfficeOpenXMLIngestor"
70+
djvu = "ingestors.documents.djvu:DjVuIngestor"
71+
pdf = "ingestors.documents.pdf:PDFIngestor"
72+
rar = "ingestors.packages.rar:RARIngestor"
73+
zip = "ingestors.packages.zip:ZipIngestor"
74+
tar = "ingestors.packages.tar:TarIngestor"
75+
7z = "ingestors.packages:SevenZipIngestor"
76+
gz = "ingestors.packages:GzipIngestor"
77+
bz2 = "ingestors.packages:BZ2Ingestor"
78+
pst = "ingestors.email.outlookpst:OutlookPSTIngestor"
79+
olm = "ingestors.email.olm:OutlookOLMArchiveIngestor"
80+
opfmsg = "ingestors.email.olm:OutlookOLMMessageIngestor"
81+
olemsg = "ingestors.email.outlookmsg:OutlookMsgIngestor"
82+
msg = "ingestors.email.msg:RFC822Ingestor"
83+
emlx = "ingestors.email.emlx:AppleEmlxIngestor"
84+
vcard = "ingestors.email.vcard:VCardIngestor"
85+
calendar = "ingestors.email.calendar:CalendarIngestor"
86+
csv = "ingestors.tabular.csv:CSVIngestor"
87+
access = "ingestors.tabular.access:AccessIngestor"
88+
sqlite = "ingestors.tabular.sqlite:SQLiteIngestor"
89+
xls = "ingestors.tabular.xls:ExcelIngestor"
90+
xlsx = "ingestors.tabular.xlsx:ExcelXMLIngestor"
91+
ods = "ingestors.tabular.ods:OpenOfficeSpreadsheetIngestor"
92+
mbox = "ingestors.email.mbox:MboxFileIngestor"
93+
dbf = "ingestors.tabular.dbf:DBFIngestor"
94+
image = "ingestors.media.image:ImageIngestor"
95+
tiff = "ingestors.media.tiff:TIFFIngestor"
96+
svg = "ingestors.media.svg:SVGIngestor"
97+
audio = "ingestors.media.audio:AudioIngestor"
98+
video = "ingestors.media.video:VideoIngestor"
99+
json = "ingestors.misc.jsonfile:JSONIngestor"
100+
101+
[build-system]
102+
# Project metadata lives in the PEP 621 [project] table, which is only read by poetry-core 2.x.
requires = ["poetry-core>=2.0.0,<3.0.0"]
103+
build-backend = "poetry.core.masonry.api"
104+
105+
[tool.poetry.group.dev.dependencies]
106+
pytest = "8.2.0"
107+
pytest-cov = "5.0.0"
108+
click = "8.1.7"

requirements.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@ normality==2.5.0
33
pantomime==0.6.1
44
followthemoney==3.5.9
55
followthemoney-store[postgresql]==3.1.0
6-
servicelayer[google,amazon]==1.23.0
6+
servicelayer @ git+https://github.com/investigativedata/servicelayer.git
77
languagecodes==1.1.1
88
countrytagger==0.1.2
99
pyicu==2.12
1010
google-cloud-vision==3.7.2
1111
tesserocr==2.7.1
1212
spacy==3.6.1 # pinned because spacy 3.8 requires numpy >2 which breaks fasttext (see https://groups.google.com/g/fasttext-library/c/4EOM0-S6xHU)
1313
numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
14-
fingerprints==1.1.1
14+
fingerprints==1.2.3
1515
fasttext==0.9.2
1616
pika==1.3.2
17+
nomenklatura==3.15.2
1718

1819
# Development
1920
pytest==8.2.0
@@ -28,7 +29,7 @@ rarfile==4.2
2829
xlrd==2.0.1
2930
openpyxl==3.1.2
3031
odfpy==1.4.1
31-
cchardet==2.1.7
32+
faust-cchardet==2.1.19
3233
lxml==5.0.0
3334
olefile==0.47
3435
Pillow==10.1.0
@@ -42,3 +43,4 @@ pymupdf==1.21.1
4243

4344
prometheus-client==0.17.1
4445
sentry_sdk==2.0.1
46+

0 commit comments

Comments
 (0)