17 changes: 6 additions & 11 deletions enthic/config.py
@@ -1,8 +1,6 @@
import logging
import logging.config
import os
from json import load
from os.path import dirname, join
from pathlib import Path

from dotenv import find_dotenv, load_dotenv
@@ -14,15 +12,12 @@
)


with open(join(dirname(__file__), "configuration.json")) as json_configuration_file:
CONFIG = load(json_configuration_file)


class Config:
FTP_MAX_VOLUME = os.environ.get("FTP_MAX_VOLUME", 6 * 1024 * 1024 * 1024)
DATADIR = Path(os.environ.get("DATADIR", Path(__file__).parent / ".."))
BUNDLE_RAW_DIR = (
Path(os.environ.get("DATADIR", Path(__file__).parent / "..")) / "bundles"
BUNDLE_RAW_DIR = Path(
os.environ.get(
"BUNDLE_RAW_DIR", Path(__file__).parent / ".." / "data" / "bundles"
)
)
INSEE_KEY = CONFIG["INSEE"]["KEY"]
INSEE_SECRET = CONFIG["INSEE"]["SECRET"]
INSEE_KEY = os.environ.get("INSEE_KEY")
INSEE_SECRET = os.environ.get("INSEE_SECRET")
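
For context, here is a minimal usage sketch of the reworked, environment-driven Config. The variable names come from the diff above; the .env contents and the calling snippet are purely illustrative, and it assumes the module runs load_dotenv(find_dotenv()) as its imports suggest.

# Illustrative .env picked up by python-dotenv (values are placeholders):
#   DATADIR=/srv/enthic/data
#   BUNDLE_RAW_DIR=/srv/enthic/data/bundles
#   INSEE_KEY=<your INSEE consumer key>
#   INSEE_SECRET=<your INSEE consumer secret>

from enthic.config import Config

print(Config.DATADIR)         # Path; falls back to the package's parent directory
print(Config.BUNDLE_RAW_DIR)  # Path; falls back to ../data/bundles relative to the package
print(Config.INSEE_KEY)       # string from the environment, or None when INSEE_KEY is unset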
207 changes: 50 additions & 157 deletions enthic/scraping/extract_bundle.py
@@ -1,29 +1,26 @@
import datetime
import json
import logging
import xml.etree.ElementTree as ElementTree
from csv import reader
from io import BytesIO
from logging import debug, info
from os import listdir
from os.path import dirname, join
from pathlib import Path
from pprint import pprint
from re import compile, sub
from zipfile import BadZipFile, ZipFile

from enthic.scraping.insee import (
get_siren_data_from_insee_api,
get_siret_data_from_insee_api,
)
from enthic.scraping.liasse import read_address_data
from enthic.scraping.liasse import Liasse, parse_xml_liasse, read_address_data
from enthic.utils.ape_utils import APE_CONVERSION
from enthic.utils.bundle_utils import ACCOUNTING_TYPE_CONVERSION, BUNDLE_CONVERSION
from enthic.utils.INPI_data_enhancer import decrypt_code_motif

from .accountability_metadata import AccountabilityMetadata, MetadataCase
from .database_requests_utils import (
SESSION,
get_metadata,
replace_bundle_into_database,
replace_metadata_ORM,
@@ -191,162 +188,58 @@ def read_identity_data(identity_xml_item, xml_file_name):
)


def process_xml_file(xml_stream, xml_name):
"""
Process an already-opened XML file.
"""
####################################################
# XML PARSER
try:
tree = ElementTree.parse(xml_stream)
except ElementTree.ParseError as error:
info("Error processing XML " + xml_name + f" : {error}")
return False
root = tree.getroot()
####################################################
# XML RELATED VARIABLES
acc_type, siren, year = (None,) * 3
####################################################
# ITERATE ALL TAGS
metadata_case = MetadataCase.IS_NEW
bundles_added_set = set()
for child in root[0]:
################################################
# IDENTITY TAGS, SIREN AND TYPE OF ACCOUNTABILITY
if child.tag == "{fr:inpi:odrncs:bilansSaisisXML}identite":
(
acc_type,
siren,
denomination,
year,
ape,
postal_code,
town,
code_motif,
code_confidentialite,
info_traitement,
duree_exercice,
date_cloture_exercice,
) = read_identity_data(child, xml_name)
############################################
# WRITE IDENTITY FILE IF ACCOUNT TYPE IS
# KNOWN
if acc_type not in ACC_ONT.keys():
return False
existing_metadata_list = get_metadata(siren)
new_metadata = AccountabilityMetadata(
siren=siren,
declaration=year,
duree_exercice=duree_exercice,
date_cloture_exercice=date_cloture_exercice,
code_motif=code_motif,
code_confidentialite=code_confidentialite,
info_traitement=info_traitement,
accountability=acc_type,
)
def process_xml_file(xml: str, xml_name: str):
liasse = parse_xml_liasse(xml)
if not liasse["bilan"]:
LOGGER.warning(
"No data extracted",
extra={"siren": liasse["siren"], "cloture": liasse["cloture"].isoformat()},
)
return
metadata_status = _update_metadata(liasse)
_save_accountability(metadata_status, liasse)

metadata_to_replace = None
for existing_metadata in existing_metadata_list:
result = new_metadata.compare(existing_metadata)
if result == MetadataCase.IGNORE:
return False
if result == MetadataCase.REPLACE:
metadata_to_replace = existing_metadata
metadata_case = result
if (
result != MetadataCase.IS_NEW
and metadata_case != MetadataCase.REPLACE
):
metadata_case = result

if len(existing_metadata_list) == 0:
save_company_to_database(
str(siren), str(denomination), str(ape), str(postal_code), str(town)
)
else:
print(
"New metadata",
new_metadata,
"differs from the metadata already in the database. Chosen action:",
metadata_case,
)
pprint(existing_metadata_list)
def _update_metadata(liasse: Liasse):
# TODO: order existing metadata by year and keep only the latest entry
# TODO: restrict the lookup to the declaration year
existing_metadata = get_metadata(liasse["siren"])
if not existing_metadata:
save_company_to_database(**liasse._to_identity())
return MetadataCase.IS_NEW

metadata = AccountabilityMetadata.from_liasse(liasse)
status = metadata.compare(existing_metadata)

if status == MetadataCase.IGNORE:
return status

if metadata_case == MetadataCase.REPLACE:
replace_metadata_ORM(new_metadata, metadata_to_replace)
else:
save_metadata_ORM(new_metadata)
################################################
# BUNDLE TAGS IN PAGES TO ITERATE WITH BUNDLE CODES
# AND AMOUNT
elif child.tag == "{fr:inpi:odrncs:bilansSaisisXML}detail":
for page in child:
for bundle in page:
try:
for bundle_code in ACC_ONT[acc_type]["bundleCodeAtt"]:
if bundle.attrib["code"] in bundle_code.keys():
for amount_code in bundle_code[bundle.attrib["code"]]:
amount_code = f"m{amount_code}"
if metadata_case == MetadataCase.COMPLEMENTARY:
sum_bundle_into_database(
siren,
str(year),
str(ACCOUNTING_TYPE_CONVERSION[acc_type]),
str(
BUNDLE_CONVERSION[
ACCOUNTING_TYPE_CONVERSION[acc_type]
][bundle.attrib["code"]]
),
str(int(bundle.attrib[amount_code])),
)
elif metadata_case == MetadataCase.REPLACE:
replace_bundle_into_database(
siren,
str(year),
str(ACCOUNTING_TYPE_CONVERSION[acc_type]),
str(
BUNDLE_CONVERSION[
ACCOUNTING_TYPE_CONVERSION[acc_type]
][bundle.attrib["code"]]
),
str(int(bundle.attrib[amount_code])),
False,
)
elif metadata_case == MetadataCase.IS_NEW:
new_bundle = (
siren,
str(year),
str(ACCOUNTING_TYPE_CONVERSION[acc_type]),
str(
BUNDLE_CONVERSION[
ACCOUNTING_TYPE_CONVERSION[acc_type]
][bundle.attrib["code"]]
),
str(int(bundle.attrib[amount_code])),
)
if new_bundle[:4] in bundles_added_set:
print(
"Bundle",
new_bundle,
"is duplicated in the XML file",
)
else:
bundles_added_set.add(new_bundle[:4])
save_bundle_to_database(
new_bundle[0],
new_bundle[1],
new_bundle[2],
new_bundle[3],
new_bundle[4],
)
except KeyError as key_error:
debug(
"{} in account {} bundle {}".format(
key_error, acc_type, bundle.attrib["code"]
)
)
SESSION.commit()
return True
if status == MetadataCase.REPLACE:
replace_metadata_ORM(metadata, existing_metadata)
else:
save_metadata_ORM(metadata)
return status


def _save_accountability(status, liasse):
opts = dict(
siren=liasse["siren"],
declaration=liasse["year"],
accountability=str(ACCOUNTING_TYPE_CONVERSION[liasse["type_bilan"]]),
)
for code, amount in liasse["bilan"].items():
bundle = str(
BUNDLE_CONVERSION[ACCOUNTING_TYPE_CONVERSION[liasse["type_bilan"]]][code]
)
amount = str(int(amount))
if status == MetadataCase.COMPLEMENTARY:
sum_bundle_into_database(bundle=bundle, amount=amount, **opts)
elif status == MetadataCase.REPLACE:
replace_bundle_into_database(
bundle=bundle, amount=amount, add_detail_mode=False, **opts
)
elif status == MetadataCase.IS_NEW:
save_bundle_to_database(bundle=bundle, amount=amount, **opts)


def process_daily_zip_file(daily_zip_file_path):
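
To show how the refactored pieces above fit together, here is a minimal driving sketch. The sample file path is hypothetical; only process_xml_file itself comes from the diff.

from pathlib import Path

from enthic.scraping.extract_bundle import process_xml_file

# Hypothetical INPI "bilans saisis" filing; any such XML string would do.
xml_path = Path("samples/bilan_sample.xml")
xml_text = xml_path.read_text(encoding="utf-8")

# parse_xml_liasse() builds the Liasse, _update_metadata() classifies the filing
# (new / complementary / replace / ignore), and _save_accountability() persists
# each bundle amount according to that classification.
process_xml_file(xml_text, xml_path.name)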
43 changes: 11 additions & 32 deletions enthic/scraping/liasse.py
@@ -13,8 +13,6 @@
RE_POSTAL_CODE_TOWN = re.compile(
r"([0-9]+)[ -]?¨?([a-zA-Z0-9`ÀéÉèÈîÎ_ \'\"-\.\(\)\-]+)"
)
RE_TOWN = re.compile(r"([a-zA-Z0-9_ \'\"-\.\(\)\-]+)")
RE_POSTAL_CODE = re.compile(r"([0-9]+)")


class Liasse(dict):
@@ -86,33 +84,14 @@ def read_address_data(address_xml_item: str):

:param address_xml_item: the identity's address XML content, as a string
"""
postal_code, town = (ModifiedData.ABSENT.value,) * 2
try:
regex_match = RE_POSTAL_CODE_TOWN.match(address_xml_item)
regex_match = RE_POSTAL_CODE_TOWN.match(address_xml_item)
if regex_match:
postal_code = regex_match.group(1)
town = regex_match.group(2).upper()
if not town.strip():
postal_code, town = (ModifiedData.WRONG_FORMAT.value,) * 2
except TypeError as error:
logging.debug(f"{str(error)}: {str(address_xml_item)}")
postal_code, town = (ModifiedData.WRONG_FORMAT.value,) * 2
except AttributeError as error:
try:
logging.debug(f"{str(error)}: {str(address_xml_item)}")
regex_match = RE_TOWN.match(address_xml_item)
town = regex_match.group(1).upper()
postal_code = ModifiedData.WRONG_FORMAT.value
except AttributeError as error:
try:
logging.debug(f"{str(error)}: {str(address_xml_item)}")
regex_match = RE_POSTAL_CODE.match(address_xml_item)
town = ModifiedData.WRONG_FORMAT.value
postal_code = regex_match.group(1)
except AttributeError as error:
logging.debug(f"{str(error)}: {str(address_xml_item)}")
postal_code, town = (ModifiedData.WRONG_FORMAT.value,) * 2

return postal_code, town
town = regex_match.group(2).upper().strip()

if town:
return postal_code, town
return (ModifiedData.WRONG_FORMAT.value,) * 2


def _parse_bilan(soup: BeautifulSoup, type_bilan: str) -> dict:
@@ -122,7 +101,7 @@ def _parse_bilan(soup: BeautifulSoup, type_bilan: str) -> dict:

fields = (
ontology.read_account()
.pipe(lambda df: df[df["accountability_code"] == type_bilan])
.pipe(lambda df: df[df["accountability"] == type_bilan])
.to_dict(orient="records")
)

@@ -132,9 +111,9 @@


def _get_field_amount(bilan: BeautifulSoup, field: dict) -> dict:
element = bilan.find("liasse", {"code": field["bundle_name"]})
element = bilan.find("liasse", {"code": field["code"]})
if not element:
return field["bundle_name"], None
return field["code"], None
column = field["column"]
amount = element.get(f"m{column}")
return field["bundle_name"], amount if amount is None else int(amount)
return field["code"], amount if amount is None else int(amount)
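
As a sanity check on the simplified read_address_data, a short behavioural sketch follows. The sample strings are invented, and ModifiedData is re-imported from liasse on the assumption that the module already imports it.

from enthic.scraping.liasse import ModifiedData, read_address_data

# A "postal code + town" string matches the single remaining regex.
assert read_address_data("69003 LYON") == ("69003", "LYON")

# Strings without a leading postal code (previously caught by the deleted
# RE_TOWN / RE_POSTAL_CODE fallbacks) now yield WRONG_FORMAT for both fields.
assert read_address_data("SASU PATRICE MAZET") == (ModifiedData.WRONG_FORMAT.value,) * 2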
2 changes: 1 addition & 1 deletion tests/scraping/test_liasse.py
@@ -71,4 +71,4 @@ def test_read_adresse_with_badly_formatted_string(self):
inp = "SASU PATRICE MAZET"
postcode, town = read_address_data(inp)
print(postcode, town)
assert (postcode, town) == (ModifiedData.WRONG_FORMAT.value, inp)
assert (postcode, town) == (ModifiedData.WRONG_FORMAT.value,) * 2