diff --git a/ckanext/datapusher_plus/jobs.py b/ckanext/datapusher_plus/jobs.py index 7d2781a..c7bdca1 100644 --- a/ckanext/datapusher_plus/jobs.py +++ b/ckanext/datapusher_plus/jobs.py @@ -1,1625 +1,31 @@ # -*- coding: utf-8 -*- -# flake8: noqa: E501 - -# Standard library imports -import csv -import hashlib -import locale -import mimetypes -import os -import subprocess -import tempfile -import time -from urllib.parse import urlsplit, urlparse -import logging -import uuid -import sys -import json -import requests -from pathlib import Path -from typing import Dict, Any, Optional, List - -# Third-party imports -import psycopg2 -from psycopg2 import sql -from datasize import DataSize -from dateutil.parser import parse as parsedate -import traceback -import sqlalchemy as sa -from rq import get_current_job - -import ckanext.datapusher_plus.utils as utils -import ckanext.datapusher_plus.helpers as dph -import ckanext.datapusher_plus.jinja2_helpers as j2h -from ckanext.datapusher_plus.job_exceptions import HTTPError -import ckanext.datapusher_plus.config as conf -import ckanext.datapusher_plus.spatial_helpers as sh -import ckanext.datapusher_plus.datastore_utils as dsu -from ckanext.datapusher_plus.logging_utils import TRACE -from ckanext.datapusher_plus.qsv_utils import QSVCommand -from ckanext.datapusher_plus.pii_screening import screen_for_pii - -if locale.getdefaultlocale()[0]: - lang, encoding = locale.getdefaultlocale() - locale.setlocale(locale.LC_ALL, locale=(lang, encoding)) -else: - locale.setlocale(locale.LC_ALL, "") - - -def validate_input(input: Dict[str, Any]) -> None: - # Especially validate metadata which is provided by the user - if "metadata" not in input: - raise utils.JobError("Metadata missing") - - data = input["metadata"] - - if "resource_id" not in data: - raise utils.JobError("No id provided.") - - -def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: - api_token = utils.get_dp_plus_user_apitoken() - headers: Dict[str, str] = { - "Content-Type": "application/json", - "Authorization": api_token, - } - - try: - result = requests.post( - result_url, - data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), - verify=conf.SSL_VERIFY, - headers=headers, - ) - except requests.ConnectionError: - return False - - return result.status_code == requests.codes.ok - - -def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: - """ - This is the main function that is called by the datapusher_plus worker - - Errors are caught and logged in the database - - Args: - input: Dictionary containing metadata and other job information - - Returns: - Optional[str]: Returns "error" if there was an error, None otherwise - """ - job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") - callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) - - job_id = get_current_job().id - errored = False - try: - push_to_datastore(input, job_id) - job_dict["status"] = "complete" - dph.mark_job_as_completed(job_id, job_dict) - except utils.JobError as e: - dph.mark_job_as_errored(job_id, str(e)) - job_dict["status"] = "error" - job_dict["error"] = str(e) - log = logging.getLogger(__name__) - log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") - errored = True - except Exception as e: - dph.mark_job_as_errored( - job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) - ) - job_dict["status"] = "error" - job_dict["error"] = str(e) - log = logging.getLogger(__name__) - log.error(f"Datapusher Plus error: {e}, 
{traceback.format_exc()}") - errored = True - finally: - # job_dict is defined in datapusher_hook's docstring - is_saved_ok = callback_datapusher_hook( - result_url=input["result_url"], job_dict=job_dict - ) - errored = errored or not is_saved_ok - return "error" if errored else None - - -def push_to_datastore( - input: Dict[str, Any], task_id: str, dry_run: bool = False -) -> Optional[List[Dict[str, Any]]]: - """Download and parse a resource push its data into CKAN's DataStore. - - An asynchronous job that gets a resource from CKAN, downloads the - resource's data file and, if the data file has changed since last time, - parses the data and posts it into CKAN's DataStore. - - Args: - input: Dictionary containing metadata and other job information - task_id: Unique identifier for the task - dry_run: If True, fetch and parse the data file but don't actually post the - data to the DataStore, instead return the data headers and rows that - would have been posted. - - Returns: - Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows - that would have been posted. Otherwise returns None. - """ - # Ensure temporary files are removed after run - with tempfile.TemporaryDirectory() as temp_dir: - return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) - - -def _push_to_datastore( - task_id: str, - input: Dict[str, Any], - dry_run: bool = False, - temp_dir: Optional[str] = None, -) -> Optional[List[Dict[str, Any]]]: - # add job to dn (datapusher_plus_jobs table) - try: - dph.add_pending_job(task_id, **input) - except sa.exc.IntegrityError: - raise utils.JobError("Job already exists.") - handler = utils.StoringHandler(task_id, input) - logger = logging.getLogger(task_id) - logger.addHandler(handler) - - # also show logs on stderr - logger.addHandler(logging.StreamHandler()) - - # set the log level to the config upload_log_level - try: - log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) - except AttributeError: - # fallback to our custom TRACE level - log_level = TRACE - - # set the log level to the config upload_log_level - logger.setLevel(logging.INFO) - logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") - logger.setLevel(log_level) - - # check if conf.QSV_BIN exists - if not Path(conf.QSV_BIN).is_file(): - raise utils.JobError(f"{conf.QSV_BIN} not found.") - - # Initialize QSVCommand - qsv = QSVCommand(logger=logger) - - validate_input(input) - - data = input["metadata"] - - ckan_url = data["ckan_url"] - resource_id = data["resource_id"] - try: - resource = dsu.get_resource(resource_id) - except utils.JobError: - # try again in 5 seconds just incase CKAN is slow at adding resource - time.sleep(5) - resource = dsu.get_resource(resource_id) - - # check if the resource url_type is a datastore - if resource.get("url_type") == "datastore": - logger.info("Dump files are managed with the Datastore API") - return - - # check scheme - resource_url = resource.get("url") - scheme = urlsplit(resource_url).scheme - if scheme not in ("http", "https", "ftp"): - raise utils.JobError("Only http, https, and ftp resources may be fetched.") - - # ========================================================================== - # DOWNLOAD - # ========================================================================== - timer_start = time.perf_counter() - dataset_stats = {} - - # fetch the resource data - logger.info(f"Fetching from: {resource_url}...") - headers: Dict[str, str] = {} - if resource.get("url_type") == "upload": - # If this is an 
uploaded file to CKAN, authenticate the request, - # otherwise we won't get file from private resources - api_token = utils.get_dp_plus_user_apitoken() - headers["Authorization"] = api_token - - # If the ckan_url differs from this url, rewrite this url to the ckan - # url. This can be useful if ckan is behind a firewall. - if not resource_url.startswith(ckan_url): - new_url = urlparse(resource_url) - rewrite_url = urlparse(ckan_url) - new_url = new_url._replace( - scheme=rewrite_url.scheme, netloc=rewrite_url.netloc - ) - resource_url = new_url.geturl() - logger.info(f"Rewritten resource url to: {resource_url}") - - try: - kwargs: Dict[str, Any] = { - "headers": headers, - "timeout": conf.TIMEOUT, - "verify": conf.SSL_VERIFY, - "stream": True, - } - if conf.USE_PROXY: - kwargs["proxies"] = { - "http": conf.DOWNLOAD_PROXY, - "https": conf.DOWNLOAD_PROXY, - } - with requests.get(resource_url, **kwargs) as response: - response.raise_for_status() - - cl = response.headers.get("content-length") - max_content_length = conf.MAX_CONTENT_LENGTH - ct = response.headers.get("content-type") - - try: - if cl and int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: - raise utils.JobError( - f"Resource too large to download: {DataSize(int(cl)):.2MB} > max ({DataSize(int(max_content_length)):.2MB})." - ) - except ValueError: - pass - - resource_format = resource.get("format").upper() - - # if format was not specified, try to get it from mime type - if not resource_format: - logger.info("File format: NOT SPECIFIED") - # if we have a mime type, get the file extension from the response header - if ct: - resource_format = mimetypes.guess_extension(ct.split(";")[0]) - - if resource_format is None: - raise utils.JobError( - "Cannot determine format from mime type. Please specify format." - ) - logger.info(f"Inferred file format: {resource_format}") - else: - raise utils.JobError( - "Server did not return content-type. Please specify format." - ) - else: - logger.info(f"File format: {resource_format}") - - tmp = os.path.join(temp_dir, "tmp." + resource_format) - length = 0 - # using MD5 for file deduplication only - # no need for it to be cryptographically secure - m = hashlib.md5() # DevSkim: ignore DS126858 - - # download the file - if cl: - logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") - else: - logger.info("Downloading file of unknown size...") - - with open(tmp, "wb") as tmp_file: - for chunk in response.iter_content(conf.CHUNK_SIZE): - length += len(chunk) - if length > max_content_length and not conf.PREVIEW_ROWS: - raise utils.JobError( - f"Resource too large to process: {length} > max ({max_content_length})." - ) - tmp_file.write(chunk) - m.update(chunk) - - except requests.HTTPError as e: - raise HTTPError( - f"DataPusher+ received a bad HTTP response when trying to download " - f"the data file from {resource_url}. 
Status code: {e.response.status_code}, " - f"Response content: {e.response.content}", - status_code=e.response.status_code, - request_url=resource_url, - response=e.response.content, - ) - except requests.RequestException as e: - raise HTTPError( - message=str(e), - status_code=None, - request_url=resource_url, - response=None, - ) - - file_hash = m.hexdigest() - dataset_stats["ORIGINAL_FILE_SIZE"] = length - - # check if the resource metadata (like data dictionary data types) - # has been updated since the last fetch - resource_updated = False - resource_last_modified = resource.get("last_modified") - if resource_last_modified: - resource_last_modified = parsedate(resource_last_modified) - file_last_modified = response.headers.get("last-modified") - if file_last_modified: - file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) - if file_last_modified < resource_last_modified: - resource_updated = True - - if ( - resource.get("hash") == file_hash - and not data.get("ignore_hash") - and not conf.IGNORE_FILE_HASH - and not resource_updated - ): - logger.warning(f"Upload skipped as the file hash hasn't changed: {file_hash}.") - return - - resource["hash"] = file_hash - - fetch_elapsed = time.perf_counter() - timer_start - logger.info( - f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." - ) - - # Check if the file is a zip file - unzipped_format = "" - if resource_format.upper() == "ZIP": - logger.info("Processing ZIP file...") - - file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( - tmp, temp_dir, logger - ) - if not file_count: - logger.error("ZIP file invalid or no files found in ZIP file.") - return - logger.info( - f"More than one file in the ZIP file ({file_count} files), saving metadata..." - if file_count > 1 - else f"Extracted {unzipped_format} file: {extracted_path}" - ) - tmp = extracted_path - - # =================================================================================== - # ANALYZE WITH QSV - # =================================================================================== - # Start Analysis using qsv instead of messytables, as - # 1) its type inferences are bullet-proof not guesses as it scans the entire file, - # 2) its super-fast, and - # 3) it has addl data-wrangling capabilities we use in DP+ (e.g. stats, dedup, etc.) - dupe_count = 0 - record_count = 0 - analysis_start = time.perf_counter() - logger.info("ANALYZING WITH QSV..") - - # flag to check if the file is a spatial format - spatial_format_flag = False - simplification_failed_flag = False - # ----------------- is it a spreadsheet? --------------- - # check content type or file extension if its a spreadsheet - spreadsheet_extensions = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] - file_format = resource.get("format").upper() - if ( - file_format in spreadsheet_extensions - or unzipped_format in spreadsheet_extensions - ): - # if so, export spreadsheet as a CSV file - default_excel_sheet = conf.DEFAULT_EXCEL_SHEET - file_format = unzipped_format if unzipped_format != "" else file_format - logger.info(f"Converting {file_format} sheet {default_excel_sheet} to CSV...") - # first, we need a temporary spreadsheet filename with the right file extension - # we only need the filename though, that's why we remove it - # and create a hardlink to the file we got from CKAN - qsv_spreadsheet = os.path.join(temp_dir, "qsv_spreadsheet." 
+ file_format) - os.link(tmp, qsv_spreadsheet) - - # run `qsv excel` and export it to a CSV - # use --trim option to trim column names and the data - qsv_excel_csv = os.path.join(temp_dir, "qsv_excel.csv") - try: - qsv_excel = qsv.excel( - qsv_spreadsheet, - sheet=default_excel_sheet, - trim=True, - output_file=qsv_excel_csv, - ) - except utils.JobError as e: - raise utils.JobError( - f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" - ) - excel_export_msg = qsv_excel.stderr - logger.info(f"{excel_export_msg}...") - tmp = qsv_excel_csv - elif resource_format.upper() in ["SHP", "QGIS", "GEOJSON"]: - logger.info("SHAPEFILE or GEOJSON file detected...") - - qsv_spatial_file = os.path.join( - temp_dir, - "qsv_spatial_" + str(uuid.uuid4()) + "." + resource_format, - ) - os.link(tmp, qsv_spatial_file) - qsv_spatial_csv = os.path.join(temp_dir, "qsv_spatial.csv") - - if conf.AUTO_SPATIAL_SIMPLIFICATION: - # Try to convert spatial file to CSV using spatial_helpers - logger.info( - f"Converting spatial file to CSV with a simplification relative tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." - ) - - try: - # Use the convert_to_csv function from spatial_helpers - success, error_message, bounds = sh.process_spatial_file( - qsv_spatial_file, - resource_format, - output_csv_path=qsv_spatial_csv, - tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, - task_logger=logger, - ) - - if success: - logger.info( - "Spatial file successfully simplified and converted to CSV" - ) - tmp = qsv_spatial_csv - - # Check if the simplified resource already exists - simplified_resource_name = ( - os.path.splitext(resource["name"])[0] - + "_simplified" - + os.path.splitext(resource["name"])[1] - ) - existing_resource, existing_resource_id = dsu.resource_exists( - resource["package_id"], simplified_resource_name - ) - - if existing_resource: - logger.info( - "Simplified resource already exists. Replacing it..." - ) - dsu.delete_resource(existing_resource_id) - else: - logger.info( - "Simplified resource does not exist. Uploading it..." - ) - new_simplified_resource = { - "package_id": resource["package_id"], - "name": os.path.splitext(resource["name"])[0] - + "_simplified" - + os.path.splitext(resource["name"])[1], - "url": "", - "format": resource["format"], - "hash": "", - "mimetype": resource["mimetype"], - "mimetype_inner": resource["mimetype_inner"], - } - - # Add bounds information if available - if bounds: - minx, miny, maxx, maxy = bounds - new_simplified_resource.update( - { - "dpp_spatial_extent": { - "type": "BoundingBox", - "coordinates": [ - [minx, miny], - [maxx, maxy], - ], - } - } - ) - logger.info( - f"Added dpp_spatial_extent to resource metadata: {bounds}" - ) - - dsu.upload_resource(new_simplified_resource, qsv_spatial_file) - - # delete the simplified spatial file - os.remove(qsv_spatial_file) - - simplification_failed_flag = False - else: - logger.warning( - f"Upload of simplified spatial file failed: {error_message}" - ) - simplification_failed_flag = True - except Exception as e: - logger.warning(f"Simplification and conversion failed: {str(e)}") - logger.warning( - f"Simplification and conversion failed. Using qsv geoconvert to convert to CSV, truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." 
- ) - simplification_failed_flag = True - pass - - # If we are not auto-simplifying or simplification failed, use qsv geoconvert - if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed_flag: - logger.info("Converting spatial file to CSV using qsv geoconvert...") - - # Run qsv geoconvert - qsv_geoconvert_csv = os.path.join(temp_dir, "qsv_geoconvert.csv") - try: - qsv.geoconvert( - tmp, - resource_format, - "csv", - max_length=conf.QSV_STATS_STRING_MAX_LENGTH, - output_file=qsv_geoconvert_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"qsv geoconvert failed: {e}") - - tmp = qsv_geoconvert_csv - logger.info("Geoconverted successfully") - - else: - # --- its not a spreadsheet nor a spatial format, its a CSV/TSV/TAB file ------ - # Normalize & transcode to UTF-8 using `qsv input`. We need to normalize as - # it could be a CSV/TSV/TAB dialect with differing delimiters, quoting, etc. - # Using qsv input's --output option also auto-transcodes to UTF-8. - # Note that we only change the workfile, the resource file itself is unchanged. - - # ------------------- Normalize to CSV --------------------- - qsv_input_csv = os.path.join(temp_dir, "qsv_input.csv") - # if resource_format is CSV we don't need to normalize - if resource_format.upper() == "CSV": - logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") - else: - # if not CSV (e.g. TSV, TAB, etc.) we need to normalize to CSV - logger.info(f"Normalizing/UTF-8 transcoding {resource_format} to CSV...") - - qsv_input_utf_8_encoded_csv = os.path.join( - temp_dir, "qsv_input_utf_8_encoded.csv" - ) - - # using uchardet to determine encoding - file_encoding = subprocess.run( - ["uchardet", tmp], - check=True, - capture_output=True, - text=True, - ) - logger.info(f"Identified encoding of the file: {file_encoding.stdout}") - - # trim the encoding string - file_encoding.stdout = file_encoding.stdout.strip() - - # using iconv to re-encode in UTF-8 OR ASCII (as ASCII is a subset of UTF-8) - if file_encoding.stdout != "UTF-8" and file_encoding.stdout != "ASCII": - logger.info( - f"File is not UTF-8 encoded. Re-encoding from {file_encoding.stdout} to UTF-8" - ) - try: - cmd = subprocess.run( - [ - "iconv", - "-f", - file_encoding.stdout, - "-t", - "UTF-8", - tmp, - ], - capture_output=True, - check=True, - ) - except subprocess.CalledProcessError as e: - raise utils.JobError( - f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" - ) - f = open(qsv_input_utf_8_encoded_csv, "wb") - f.write(cmd.stdout) - f.close() - logger.info("Successfully re-encoded to UTF-8") - - else: - qsv_input_utf_8_encoded_csv = tmp - try: - qsv.input(tmp, trim_headers=True, output_file=qsv_input_csv) - except utils.JobError as e: - raise utils.JobError( - f"Job aborted as the file cannot be normalized/transcoded: {e}." - ) - tmp = qsv_input_csv - logger.info("Normalized & transcoded...") - - # ------------------------------------- Validate CSV -------------------------------------- - # Run an RFC4180 check with `qsv validate` against the normalized, UTF-8 encoded CSV file. - # Even excel exported CSVs can be potentially invalid, as it allows the export of "flexible" - # CSVs - i.e. rows may have different column counts. - # If it passes validation, we can handle it with confidence downstream as a "normal" CSV. 
- logger.info("Validating CSV...") - try: - qsv.validate(tmp) - except utils.JobError as e: - raise utils.JobError(f"qsv validate failed: {e}") - - logger.info("Well-formed, valid CSV file confirmed...") - - # --------------------- Sortcheck -------------------------- - # if SORT_AND_DUPE_CHECK is True or DEDUP is True - # check if the file is sorted and if it has duplicates - # get the record count, unsorted breaks and duplicate count as well - if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: - logger.info("Checking for duplicates and if the CSV is sorted...") - - try: - qsv_sortcheck = qsv.sortcheck(tmp, json_output=True, uses_stdio=True) - except utils.JobError as e: - raise utils.JobError( - f"Failed to check if CSV is sorted and has duplicates: {e}" - ) - - try: - # Handle both subprocess.CompletedProcess and dict outputs - stdout_content = ( - qsv_sortcheck.stdout - if hasattr(qsv_sortcheck, "stdout") - else qsv_sortcheck.get("stdout") - ) - sortcheck_json = json.loads(str(stdout_content)) - except (json.JSONDecodeError, AttributeError) as e: - raise utils.JobError(f"Failed to parse sortcheck JSONoutput: {e}") - - try: - # Extract and validate required fields - is_sorted = bool(sortcheck_json.get("sorted", False)) - record_count = int(sortcheck_json.get("record_count", 0)) - unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) - dupe_count = int(sortcheck_json.get("dupe_count", 0)) - dataset_stats["IS_SORTED"] = is_sorted - dataset_stats["RECORD_COUNT"] = record_count - dataset_stats["UNSORTED_BREAKS"] = unsorted_breaks - dataset_stats["DUPE_COUNT"] = dupe_count - except (ValueError, TypeError) as e: - raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") - - # Format the message with clear statistics - sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" - if is_sorted and dupe_count > 0: - sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" - - logger.info(sortcheck_msg) - - # --------------- Do we need to dedup? ------------------ - if conf.DEDUP and dupe_count > 0: - qsv_dedup_csv = os.path.join(temp_dir, "qsv_dedup.csv") - logger.info(f"{dupe_count} duplicate rows found. Deduping...") - - try: - qsv.extdedup(tmp, qsv_dedup_csv) - except utils.JobError as e: - raise utils.JobError(f"Check for duplicates error: {e}") - - dataset_stats["DEDUPED"] = True - tmp = qsv_dedup_csv - logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") - else: - dataset_stats["DEDUPED"] = False - - # ----------------------- Headers & Safenames --------------------------- - # get existing header names, so we can use them for data dictionary labels - # should we need to change the column name to make it "db-safe" - try: - qsv_headers = qsv.headers(tmp, just_names=True) - except utils.JobError as e: - raise utils.JobError(f"Cannot scan CSV headers: {e}") - original_headers = str(qsv_headers.stdout).strip() - original_header_dict = { - idx: ele for idx, ele in enumerate(original_headers.splitlines()) - } - - # now, ensure our column/header names identifiers are "safe names" - # i.e. 
valid postgres/CKAN Datastore identifiers - qsv_safenames_csv = os.path.join(temp_dir, "qsv_safenames.csv") - logger.info('Checking for "database-safe" header names...') - try: - qsv_safenames = qsv.safenames( - tmp, - mode="json", - reserved=conf.RESERVED_COLNAMES, - prefix=conf.UNSAFE_PREFIX, - uses_stdio=True, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot scan CSV headers: {e}") - - unsafe_json = json.loads(str(qsv_safenames.stdout)) - unsafe_headers = unsafe_json["unsafe_headers"] - - if unsafe_headers: - logger.info( - f'"{len(unsafe_headers)} unsafe" header names found ({unsafe_headers}). Sanitizing..."' - ) - qsv_safenames = qsv.safenames( - tmp, mode="conditional", output_file=qsv_safenames_csv - ) - tmp = qsv_safenames_csv - else: - logger.info("No unsafe header names found...") - - # ---------------------- Type Inferencing ----------------------- - # at this stage, we have a "clean" CSV ready for Type Inferencing - - # first, index csv for speed - count, stats and slice - # are all accelerated/multithreaded when an index is present - try: - qsv_index_file = tmp + ".idx" - qsv.index(tmp) - except utils.JobError as e: - raise utils.JobError(f"Cannot index CSV: {e}") - - # if SORT_AND_DUPE_CHECK = True, we already know the record count - # so we can skip qsv count. - if not conf.SORT_AND_DUPE_CHECK: - # get record count, this is instantaneous with an index - try: - qsv_count = qsv.count(tmp) - record_count = int(str(qsv_count.stdout).strip()) - dataset_stats["RECORD_COUNT"] = record_count - except utils.JobError as e: - raise utils.JobError(f"Cannot count records in CSV: {e}") - - # its empty, nothing to do - if record_count == 0: - logger.warning("Upload skipped as there are zero records.") - return - - # log how many records we detected - unique_qualifier = "" - if conf.DEDUP: - unique_qualifier = "unique" - logger.info(f"{record_count} {unique_qualifier} records detected...") - - # run qsv stats to get data types and summary statistics - logger.info("Inferring data types and compiling statistics...") - headers = [] - types = [] - headers_min = [] - headers_max = [] - headers_cardinality = [] - qsv_stats_csv = os.path.join(temp_dir, "qsv_stats.csv") - - try: - # If the file is a spatial format, we need to use --max-length - # to truncate overly long strings from causing issues with - # Python's CSV reader and Postgres's limits with the COPY command - if spatial_format_flag: - env = os.environ.copy() - env["QSV_STATS_STRING_MAX_LENGTH"] = str(conf.QSV_STATS_STRING_MAX_LENGTH) - qsv_stats = qsv.stats( - tmp, - infer_dates=True, - dates_whitelist=conf.QSV_DATES_WHITELIST, - stats_jsonl=True, - prefer_dmy=conf.PREFER_DMY, - cardinality=bool(conf.AUTO_INDEX_THRESHOLD), - summary_stats_options=conf.SUMMARY_STATS_OPTIONS, - output_file=qsv_stats_csv, - env=env, - ) - else: - qsv_stats = qsv.stats( - tmp, - infer_dates=True, - dates_whitelist=conf.QSV_DATES_WHITELIST, - stats_jsonl=True, - prefer_dmy=conf.PREFER_DMY, - cardinality=bool(conf.AUTO_INDEX_THRESHOLD), - summary_stats_options=conf.SUMMARY_STATS_OPTIONS, - output_file=qsv_stats_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") - - # Dictionary to look up stats by resource field name - resource_fields_stats = {} - - with open(qsv_stats_csv, mode="r") as inp: - reader = csv.DictReader(inp) - for row in reader: - # Add to stats dictionary with resource field name as key - resource_fields_stats[row["field"]] = {"stats": row} - - fr = {k: v for k, 
v in row.items()} - schema_field = fr.get("field", "Unnamed Column") - if schema_field.startswith("qsv_"): - break - headers.append(schema_field) - types.append(fr.get("type", "String")) - headers_min.append(fr["min"]) - headers_max.append(fr["max"]) - if conf.AUTO_INDEX_THRESHOLD: - headers_cardinality.append(int(fr.get("cardinality") or 0)) - - # Get the field stats for each field in the headers list - existing = dsu.datastore_resource_exists(resource_id) - existing_info = None - if existing: - existing_info = dict( - (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f - ) - - # if this is an existing resource - # override with types user requested in Data Dictionary - if existing_info: - types = [ - { - "text": "String", - "numeric": "Float", - "timestamp": "DateTime", - }.get(existing_info.get(h, {}).get("type_override"), t) - for t, h in zip(types, headers) - ] - - # Delete existing datastore resource before proceeding. - if existing: - logger.info(f'Deleting existing resource "{resource_id}" from datastore.') - dsu.delete_datastore_resource(resource_id) - - # 1st pass of building headers_dict - # here we map inferred types to postgresql data types - default_type = "String" - temp_headers_dicts = [ - dict( - id=field[0], - type=conf.TYPE_MAPPING.get( - str(field[1]) if field[1] else default_type, "text" - ), - ) - for field in zip(headers, types) - ] - - # 2nd pass header_dicts, checking for smartint types. - # "smartint" will automatically select the best integer data type based on the - # min/max values of the column we got from qsv stats. - # We also set the Data Dictionary Label to original column names in case we made - # the names "db-safe" as the labels are used by DataTables_view to label columns - # we also take note of datetime/timestamp fields, so we can normalize them - # to RFC3339 format, which is Postgres COPY ready - datetimecols_list = [] - headers_dicts = [] - for idx, header in enumerate(temp_headers_dicts): - if header["type"] == "smartint": - if ( - int(headers_max[idx]) <= conf.POSTGRES_INT_MAX - and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN - ): - header_type = "integer" - elif ( - int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX - and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN - ): - header_type = "bigint" - else: - header_type = "numeric" - else: - header_type = header["type"] - if header_type == "timestamp": - datetimecols_list.append(header["id"]) - info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) - headers_dicts.append(dict(id=header["id"], type=header_type, info=info_dict)) - - # Maintain data dictionaries from matching column names - # if data dictionary already exists for this resource as - # we want to preserve the user's data dictionary curations - if existing_info: - for h in headers_dicts: - if h["id"] in existing_info: - h["info"] = existing_info[h["id"]] - # create columns with types user requested - type_override = existing_info[h["id"]].get("type_override") - if type_override in list(conf.TYPE_MAPPING.values()): - h["type"] = type_override - - logger.info(f"Determined headers and types: {headers_dicts}...") - - # ----------------------- Frequency Table --------------------------- - # compile a frequency table for each column - qsv_freq_csv = os.path.join(temp_dir, "qsv_freq.csv") - - try: - qsv.frequency(tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a frequency table: {e}") - - resource_fields_freqs = {} - try: 
- with open(qsv_freq_csv, "r") as f: - reader = csv.DictReader(f) - for row in reader: - field = row["field"] - value = row["value"] - count = row["count"] - percentage = row["percentage"] - - # Initialize list for field if it doesn't exist - if field not in resource_fields_freqs: - resource_fields_freqs[field] = [] - - # Append the frequency data as a dict to the field's list - resource_fields_freqs[field].append( - { - "value": value, - "count": count, - "percentage": percentage, - } - ) - - logger.trace(f"Resource fields freqs: {resource_fields_freqs}") - - except IOError as e: - raise utils.JobError("Could not open frequency CSV file: {}".format(e)) - - # ------------------- Do we need to create a Preview? ----------------------- - # if conf.PREVIEW_ROWS is not zero, create a preview using qsv slice - # we do the rows_to_copy > conf.PREVIEW_ROWS to check if we don't need to slice - # the CSV anymore if we only did a partial download of N conf.PREVIEW_ROWS already - rows_to_copy = record_count - if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: - if conf.PREVIEW_ROWS > 0: - # conf.PREVIEW_ROWS is positive, slice from the beginning - logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") - qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") - try: - qsv.slice(tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a preview slice: {e}") - rows_to_copy = conf.PREVIEW_ROWS - tmp = qsv_slice_csv - else: - # conf.PREVIEW_ROWS is negative, slice from the end - # TODO: do http range request so we don't have to download the whole file - # to slice from the end - slice_len = abs(conf.PREVIEW_ROWS) - logger.info(f"Preparing {slice_len}-row preview from the end...") - qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") - try: - qsv.slice(tmp, start=-1, length=slice_len, output_file=qsv_slice_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a preview slice from the end: {e}") - rows_to_copy = slice_len - tmp = qsv_slice_csv - - dataset_stats["PREVIEW_FILE_SIZE"] = os.path.getsize(tmp) - dataset_stats["PREVIEW_RECORD_COUNT"] = rows_to_copy - - # ---------------- Normalize dates to RFC3339 format -------------------- - # if there are any datetime fields, normalize them to RFC3339 format - # so we can readily insert them as timestamps into postgresql with COPY - if datetimecols_list: - qsv_applydp_csv = os.path.join(temp_dir, "qsv_applydp.csv") - datecols = ",".join(datetimecols_list) - - logger.info( - f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format with PREFER_DMY: {conf.PREFER_DMY}...' - ) - try: - qsv.datefmt( - datecols, - tmp, - prefer_dmy=conf.PREFER_DMY, - output_file=qsv_applydp_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"Applydp error: {e}") - tmp = qsv_applydp_csv - - # -------------------- QSV ANALYSIS DONE -------------------- - analysis_elapsed = time.perf_counter() - analysis_start - logger.info( - f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." 
- ) - - # ----------------------------- PII Screening ------------------------------ - # we scan for Personally Identifiable Information (PII) using qsv's powerful - # searchset command which can SIMULTANEOUSLY compare several regexes per - # field in one pass - piiscreening_start = 0 - piiscreening_elapsed = 0 - pii_found = False - - if conf.PII_SCREENING: - piiscreening_start = time.perf_counter() - pii_found = screen_for_pii(tmp, resource, qsv, temp_dir, logger) - piiscreening_elapsed = time.perf_counter() - piiscreening_start - - dataset_stats["PII_SCREENING"] = conf.PII_SCREENING - dataset_stats["PII_FOUND"] = pii_found - - # delete the qsv index file manually - # as it was created by qsv index, and not by tempfile - os.remove(qsv_index_file) - - # at this stage, the resource is ready for COPYing to the Datastore - - if dry_run: - logger.warning("Dry run only. Returning without copying to the Datastore...") - return headers_dicts - - # ============================================================ - # COPY to Datastore - # ============================================================ - copy_start = time.perf_counter() - - if conf.PREVIEW_ROWS: - logger.info(f"COPYING {rows_to_copy}-row preview to Datastore...") - else: - logger.info(f"COPYING {rows_to_copy} rows to Datastore...") - - # first, let's create an empty datastore table w/ guessed types - dsu.send_resource_to_datastore( - resource=None, - resource_id=resource["id"], - headers=headers_dicts, - records=None, - aliases=None, - calculate_record_count=False, - ) - - copied_count = 0 - try: - raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) - except psycopg2.Error as e: - raise utils.JobError(f"Could not connect to the Datastore: {e}") - else: - cur = raw_connection.cursor() - - # truncate table in case we're loading over an existing resource - try: - cur.execute( - sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) - ) - # commit to ensure that the AccessExclusive lock is only held for the - # duration of the truncate, otherwise no other access to the table is - # allowed, blocking all selects. - raw_connection.commit() - except psycopg2.Error as e: - logger.warning(f"Could not TRUNCATE: {e}") - - col_names_list = [h["id"] for h in headers_dicts] - column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) - copy_sql = sql.SQL( - "COPY {} ({}) FROM STDIN " - "WITH (FORMAT CSV, " - "HEADER 1, ENCODING 'UTF8');" - ).format( - sql.Identifier(resource_id), - column_names, - ) - # specify a 1MB buffer size for COPY read from disk - with open(tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: - try: - cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) - except psycopg2.Error as e: - raise utils.JobError(f"Postgres COPY failed: {e}") - else: - copied_count = cur.rowcount - - raw_connection.commit() - # this is needed to issue a VACUUM ANALYZE - raw_connection.set_isolation_level( - psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT - ) - analyze_cur = raw_connection.cursor() - analyze_cur.execute( - sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) - ) - analyze_cur.close() - - copy_elapsed = time.perf_counter() - copy_start - logger.info( - f'...copying done. Copied {copied_count} rows to "{resource_id}" in {copy_elapsed:,.2f} seconds.' 
- ) - - # ================================================================================================= - # INDEXING - # ================================================================================================= - # if AUTO_INDEX_THRESHOLD > 0 or AUTO_INDEX_DATES is true - # create indices automatically based on summary statistics - # For columns w/ cardinality = record_count, it's all unique values, create a unique index - # If AUTO_INDEX_DATES is true, index all date columns - # if a column's cardinality <= AUTO_INDEX_THRESHOLD, create an index for that column - if ( - conf.AUTO_INDEX_THRESHOLD - or (conf.AUTO_INDEX_DATES and datetimecols_list) - or conf.AUTO_UNIQUE_INDEX - ): - index_start = time.perf_counter() - logger.info( - f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} Auto-index dates: {conf.AUTO_INDEX_DATES} ..." - ) - index_cur = raw_connection.cursor() - - # if auto_index_threshold == -1 - # we index all the columns - if conf.AUTO_INDEX_THRESHOLD == -1: - conf.AUTO_INDEX_THRESHOLD = record_count - - index_count = 0 - for idx, cardinality in enumerate(headers_cardinality): - curr_col = headers[idx] - if ( - conf.AUTO_INDEX_THRESHOLD > 0 - or conf.AUTO_INDEX_DATES - or conf.AUTO_UNIQUE_INDEX - ): - if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: - # all the values are unique for this column, create a unique index - if conf.PREVIEW_ROWS > 0: - unique_value_count = min(conf.PREVIEW_ROWS, cardinality) - else: - unique_value_count = cardinality - logger.info( - f'Creating UNIQUE index on "{curr_col}" for {unique_value_count} unique values...' - ) - try: - index_cur.execute( - sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( - sql.Identifier(resource_id), - sql.Identifier(curr_col), - ) - ) - except psycopg2.Error as e: - logger.warning( - f'Could not CREATE UNIQUE INDEX on "{curr_col}": {e}' - ) - index_count += 1 - elif cardinality <= conf.AUTO_INDEX_THRESHOLD or ( - conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) - ): - # cardinality <= auto_index_threshold or its a date and auto_index_date is true - # create an index - if curr_col in datetimecols_list: - logger.info( - f'Creating index on "{curr_col}" date column for {cardinality} unique value/s...' - ) - else: - logger.info( - f'Creating index on "{curr_col}" for {cardinality} unique value/s...' - ) - try: - index_cur.execute( - sql.SQL("CREATE INDEX ON {} ({})").format( - sql.Identifier(resource_id), - sql.Identifier(curr_col), - ) - ) - except psycopg2.Error as e: - logger.warning(f'Could not CREATE INDEX on "{curr_col}": {e}') - index_count += 1 - - index_cur.close() - raw_connection.commit() - - logger.info("Vacuum Analyzing table to optimize indices...") - - # this is needed to issue a VACUUM ANALYZE - raw_connection.set_isolation_level( - psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT - ) - analyze_cur = raw_connection.cursor() - analyze_cur.execute( - sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) - ) - analyze_cur.close() - - index_elapsed = time.perf_counter() - index_start - logger.info( - f'...indexing/vacuum analysis done. Indexed {index_count} column/s in "{resource_id}" in {index_elapsed:,.2f} seconds.' - ) - - # ============================================================ - # PROCESS DRUF JINJA2 FORMULAE - # ============================================================ - # Check if there are any fields with DRUF keys in the scheming_yaml - # There are two types of DRUF keys: - # 1. 
"formula": This is used to update the field value DIRECTLY - # when the resource is created/updated. It can update both package and resource fields. - # 2. "suggestion_formula": This is used to populate the suggestion - # popovers DURING data entry/curation. - # DRUF keys are stored as jinja2 template expressions in the scheming_yaml - # and are rendered using the Jinja2 template engine. - formulae_start = time.perf_counter() - - # Fetch the scheming_yaml and package - package_id = resource["package_id"] - scheming_yaml, package = dsu.get_scheming_yaml( - package_id, scheming_yaml_type="dataset" - ) - - # Check for suggestion_formula in dataset_fields - has_suggestion_formula = any( - isinstance(field, dict) - and any(key.startswith("suggestion_formula") for key in field.keys()) - for field in scheming_yaml["dataset_fields"] - ) - - if has_suggestion_formula: - - logger.info( - 'Found suggestion formulae in schema' - ) - - # Check for "dpp_suggestions" in scheming_yaml - schema_has_dpp_suggestions = any( - isinstance(field, dict) - and field.get("field_name") == "dpp_suggestions" - for field in scheming_yaml["dataset_fields"] - ) - if not schema_has_dpp_suggestions: - logger.error( - '"dpp_suggestions" field required but not found in your schema. Ensure that your scheming.yaml file contains the "dpp_suggestions" field as a json_object.' - ) - return - else: - logger.info( - 'Found "dpp_suggestions" field in schema' - ) - - # add "dpp_suggestions" to package if it does not exist - if "dpp_suggestions" not in package: - - logger.warning( - 'Warning: "dpp_suggestions" field required to process Suggestion Formulae is not found in this package. Adding "dpp_suggestions" to package' - ) - - try: - package["dpp_suggestions"] = {} - dsu.patch_package(package) - logger.warning( - '"dpp_suggestions" field added to package' - ) - - except Exception as e: - logger.error( - f'Error adding "dpp_suggestions" field {e}' - ) - return - else: - logger.info( - 'No suggestion formulae found' - ) - - logger.trace(f"package: {package}") - - # FIRST, INITIALIZE THE FORMULA PROCESSOR - formula_processor = j2h.FormulaProcessor( - scheming_yaml, - package, - resource, - resource_fields_stats, - resource_fields_freqs, - dataset_stats, - logger, - ) - - package.setdefault("dpp_suggestions", {})[ - "STATUS" - ] = "STARTING FORMULAE PROCESSING..." - dsu.patch_package(package) - - # Clear all lru_cache before processing formulae - dsu.datastore_search.cache_clear() - dsu.datastore_search_sql.cache_clear() - dsu.datastore_info.cache_clear() - dsu.index_exists.cache_clear() - - # SECOND, WE PROCESS THE FORMULAE THAT UPDATE THE - # PACKAGE AND RESOURCE FIELDS DIRECTLY - # using the package_patch CKAN API so we only update the fields - # with formulae - package_updates = formula_processor.process_formulae( - "package", "dataset_fields", "formula" - ) - if package_updates: - # Update package with formula results - package.update(package_updates) - status_msg = "PACKAGE formulae processed..." 
- package["dpp_suggestions"]["STATUS"] = status_msg - try: - patched_package = dsu.patch_package(package) - logger.debug(f"Package after patching: {patched_package}") - package = patched_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error patching package: {str(e)}") - - # Process resource formulae - # as this is a direct update, we update the resource dictionary directly - resource_updates = formula_processor.process_formulae( - "resource", "resource_fields", "formula" - ) - if resource_updates: - # Update resource with formula results - resource.update(resource_updates) - status_msg = "RESOURCE formulae processed..." - if resource.get("dpp_suggestions"): - resource["dpp_suggestions"]["STATUS"] = status_msg - else: - resource["dpp_suggestions"] = {"STATUS": status_msg} - logger.info(status_msg) - - # THIRD, WE PROCESS THE SUGGESTIONS THAT SHOW UP IN THE SUGGESTION POPOVER - # we update the package dpp_suggestions field - # from which the Suggestion popover UI will pick it up - package_suggestions = formula_processor.process_formulae( - "package", "dataset_fields", "suggestion_formula" - ) - if package_suggestions: - logger.trace(f"package_suggestions: {package_suggestions}") - revise_update_content = {"package": package_suggestions} - try: - status_msg = "PACKAGE suggestion formulae processed..." - revise_update_content["STATUS"] = status_msg - revised_package = dsu.revise_package( - package_id, update={"dpp_suggestions": revise_update_content} - ) - logger.trace(f"Package after revising: {revised_package}") - package = revised_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error revising package: {str(e)}") - - # Process resource suggestion formulae - # Note how we still update the PACKAGE dpp_suggestions field - # and there is NO RESOURCE dpp_suggestions field. - # This is because suggestion formulae are used to populate the - # suggestion popover DURING data entry/curation and suggestion formulae - # may update both package and resource fields. - resource_suggestions = formula_processor.process_formulae( - "resource", "resource_fields", "suggestion_formula" - ) - if resource_suggestions: - logger.trace(f"resource_suggestions: {resource_suggestions}") - resource_name = resource["name"] - revise_update_content = {"resource": {resource_name: resource_suggestions}} - - # Handle existing suggestions - if package.get("dpp_suggestions"): - package["dpp_suggestions"].update(revise_update_content["resource"]) - else: - package["dpp_suggestions"] = revise_update_content["resource"] - - try: - status_msg = "RESOURCE suggestion formulae processed..." - revise_update_content["STATUS"] = status_msg - - revised_package = dsu.revise_package( - package_id, update={"dpp_suggestions": revise_update_content} - ) - logger.trace(f"Package after revising: {revised_package}") - package = revised_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error revising package: {str(e)}") - - # -------------------- FORMULAE PROCESSING DONE -------------------- - formulae_elapsed = time.perf_counter() - formulae_start - logger.info( - f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." 
- ) - - # ============================================================ - # UPDATE RESOURCE METADATA - # ============================================================ - metadata_start = time.perf_counter() - logger.info("UPDATING RESOURCE METADATA...") - - # --------------------- AUTO-ALIASING ------------------------ - # aliases are human-readable, and make it easier to use than resource id hash - # when using the Datastore API and in SQL queries - alias = None - if conf.AUTO_ALIAS: - logger.info(f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ...") - # get package info, so we can construct the alias - package = dsu.get_package(resource["package_id"]) - - resource_name = resource.get("name") - package_name = package.get("name") - owner_org = package.get("organization") - owner_org_name = "" - if owner_org: - owner_org_name = owner_org.get("name") - if resource_name and package_name and owner_org_name: - # we limit it to 55, so we still have space for sequence & stats suffix - # postgres max identifier length is 63 - alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] - # if AUTO_ALIAS_UNIQUE is true, check if the alias already exist, if it does - # add a sequence suffix so the new alias can be created - cur.execute( - "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", - (alias + "%",), - ) - alias_query_result = cur.fetchone() - if alias_query_result: - alias_count = alias_query_result[0] - existing_alias_of = alias_query_result[1] - else: - alias_count = 0 - existing_alias_of = "" - if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: - alias_sequence = alias_count + 1 - while True: - # we do this, so we're certain the new alias does not exist - # just in case they deleted an older alias with a lower sequence # - alias = f"{alias}-{alias_sequence:03}" - cur.execute( - "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", - (alias + "%",), - ) - alias_exists = cur.fetchone()[0] - if not alias_exists: - break - alias_sequence += 1 - elif alias_count == 1: - logger.warning( - f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' - ) - try: - cur.execute( - sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) - ) - except psycopg2.Error as e: - logger.warning(f"Could not drop alias/view: {e}") - - else: - logger.warning( - f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" - ) - alias = None - - # -------- should we ADD_SUMMARY_STATS_RESOURCE? ------------- - # by default, we only add summary stats if we're not doing a partial download - # (otherwise, you're summarizing the preview, not the whole file) - # That is, unless SUMMARY_STATS_WITH_PREVIEW is set to true - if conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW: - stats_resource_id = resource_id + "-stats" - - # check if the stats already exist - existing_stats = dsu.datastore_resource_exists(stats_resource_id) - # Delete existing summary-stats before proceeding. 
- if existing_stats: - logger.info(f'Deleting existing summary stats "{stats_resource_id}".') - - cur.execute( - "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", - (stats_resource_id + "%",), - ) - stats_alias_result = cur.fetchone() - if stats_alias_result: - existing_stats_alias_of = stats_alias_result[0] - - dsu.delete_datastore_resource(existing_stats_alias_of) - dsu.delete_resource(existing_stats_alias_of) - - stats_aliases = [stats_resource_id] - if conf.AUTO_ALIAS: - auto_alias_stats_id = alias + "-stats" - stats_aliases.append(auto_alias_stats_id) - - # check if the summary-stats alias already exist. We need to do this as summary-stats resources - # may end up having the same alias if AUTO_ALIAS_UNIQUE is False, so we need to drop the - # existing summary stats-alias. - existing_alias_stats = dsu.datastore_resource_exists(auto_alias_stats_id) - # Delete existing auto-aliased summary-stats before proceeding. - if existing_alias_stats: - logger.info( - f'Deleting existing alias summary stats "{auto_alias_stats_id}".' - ) - - cur.execute( - "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", - (auto_alias_stats_id + "%",), - ) - result = cur.fetchone() - if result: - existing_stats_alias_of = result[0] - - dsu.delete_datastore_resource(existing_stats_alias_of) - dsu.delete_resource(existing_stats_alias_of) - - # run stats on stats CSV to get header names and infer data types - # we don't need summary statistics, so use the --typesonly option - try: - qsv_stats_stats = qsv.stats( - qsv_stats_csv, - typesonly=True, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot run stats on CSV stats: {e}") - - stats_stats = str(qsv_stats_stats.stdout).strip() - stats_stats_dict = [ - dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) - for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) - ] - - logger.info(f"stats_stats_dict: {stats_stats_dict}") - - resource_name = resource.get("name") - stats_resource = { - "package_id": resource["package_id"], - "name": resource_name + " - Summary Statistics", - "format": "CSV", - "mimetype": "text/csv", - } - stats_response = dsu.send_resource_to_datastore( - stats_resource, - resource_id=None, - headers=stats_stats_dict, - records=None, - aliases=stats_aliases, - calculate_record_count=False, - ) - - logger.info(f"stats_response: {stats_response}") - - new_stats_resource_id = stats_response["result"]["resource_id"] - - # now COPY the stats to the datastore - col_names_list = [h["id"] for h in stats_stats_dict] - logger.info( - f'ADDING SUMMARY STATISTICS {col_names_list} in "{new_stats_resource_id}" with alias/es "{stats_aliases}"...' 
- ) - - column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) - - copy_sql = sql.SQL( - "COPY {} ({}) FROM STDIN " - "WITH (FORMAT CSV, " - "HEADER 1, ENCODING 'UTF8');" - ).format( - sql.Identifier(new_stats_resource_id), - column_names, - ) - - with open(qsv_stats_csv, "rb") as f: - try: - cur.copy_expert(copy_sql, f) - except psycopg2.Error as e: - raise utils.JobError(f"Postgres COPY failed: {e}") - - stats_resource["id"] = new_stats_resource_id - stats_resource["summary_statistics"] = True - stats_resource["summary_of_resource"] = resource_id - dsu.update_resource(stats_resource) - - cur.close() - raw_connection.commit() - raw_connection.close() - - resource["datastore_active"] = True - resource["total_record_count"] = record_count - if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): - resource["preview"] = True - resource["preview_rows"] = copied_count - else: - resource["preview"] = False - resource["preview_rows"] = None - resource["partial_download"] = False - dsu.update_resource(resource) - - # tell CKAN to calculate_record_count and set alias if set - dsu.send_resource_to_datastore( - resource=None, - resource_id=resource["id"], - headers=headers_dicts, - records=None, - aliases=alias, - calculate_record_count=True, - ) - - if alias: - logger.info(f'Created alias "{alias}" for "{resource_id}"...') - - metadata_elapsed = time.perf_counter() - metadata_start - logger.info( - f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in {metadata_elapsed:,.2f} seconds." - ) - - # -------------------- DONE -------------------- - package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" - dsu.patch_package(package) - - total_elapsed = time.perf_counter() - timer_start - newline_var = "\n" - end_msg = f""" - DATAPUSHER+ JOB DONE! -   Download: {fetch_elapsed:,.2f} -   Analysis: {analysis_elapsed:,.2f}{(newline_var + f" PII Screening: {piiscreening_elapsed:,.2f}") if piiscreening_elapsed > 0 else ""} -   COPYing: {copy_elapsed:,.2f} -   Indexing: {index_elapsed:,.2f} -   Formulae processing: {formulae_elapsed:,.2f} -   Resource metadata updates: {metadata_elapsed:,.2f} - TOTAL ELAPSED TIME: {total_elapsed:,.2f} - """ - logger.info(end_msg) +""" +DataPusher Plus Jobs Module - Backward Compatibility Wrapper + +This file provides backward compatibility for code importing from the original +jobs.py module. The actual implementation has been refactored into a modular +pipeline architecture located in the jobs/ subdirectory. + +For the refactored implementation, see: +- jobs/pipeline.py - Main orchestration logic +- jobs/context.py - Processing context state +- jobs/stages/ - Individual pipeline stages + +Original implementation preserved in jobs_legacy.py for reference. +""" + +# Import and re-export main entry points from the refactored pipeline +from ckanext.datapusher_plus.jobs.pipeline import ( + datapusher_plus_to_datastore, + push_to_datastore, + validate_input, + callback_datapusher_hook, +) + +# Export all public functions +__all__ = [ + "datapusher_plus_to_datastore", + "push_to_datastore", + "validate_input", + "callback_datapusher_hook", +] diff --git a/ckanext/datapusher_plus/jobs/__init__.py b/ckanext/datapusher_plus/jobs/__init__.py new file mode 100644 index 0000000..53f1ba0 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +""" +DataPusher Plus Jobs Module + +This module contains the refactored job processing pipeline for DataPusher Plus. 
+The monolithic jobs.py has been refactored into a clean pipeline architecture. +""" + +# Re-export main entry points for backward compatibility +from ckanext.datapusher_plus.jobs.pipeline import ( + datapusher_plus_to_datastore, + push_to_datastore, +) + +__all__ = [ + "datapusher_plus_to_datastore", + "push_to_datastore", +] diff --git a/ckanext/datapusher_plus/jobs/context.py b/ckanext/datapusher_plus/jobs/context.py new file mode 100644 index 0000000..d4f40ab --- /dev/null +++ b/ckanext/datapusher_plus/jobs/context.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" +ProcessingContext for the DataPusher Plus pipeline. + +This class holds all state that is passed between pipeline stages. +""" + +import logging +from typing import Dict, Any, Optional, List +from dataclasses import dataclass, field + +from ckanext.datapusher_plus.qsv_utils import QSVCommand + + +@dataclass +class ProcessingContext: + """ + Context object that holds all state for the data processing pipeline. + + This object is passed through each stage of the pipeline and is modified + by each stage to track progress and intermediate results. + """ + + # Task/Job identification + task_id: str + input: Dict[str, Any] + dry_run: bool = False + + # Directories and file paths + temp_dir: str = "" + tmp: str = "" # Current working CSV file (changes throughout pipeline) + + # Logging and utilities + logger: Optional[logging.Logger] = None + qsv: Optional[QSVCommand] = None + + # Resource information (from CKAN) + resource: Dict[str, Any] = field(default_factory=dict) + resource_id: str = "" + resource_url: str = "" + ckan_url: str = "" + + # Headers and schema + headers_dicts: List[Dict[str, Any]] = field(default_factory=list) + headers: List[str] = field(default_factory=list) + original_header_dict: Dict[int, str] = field(default_factory=dict) + + # Statistics and metadata + dataset_stats: Dict[str, Any] = field(default_factory=dict) + resource_fields_stats: Dict[str, Any] = field(default_factory=dict) + resource_fields_freqs: Dict[str, Any] = field(default_factory=dict) + + # Datastore information + existing_info: Optional[Dict[str, Any]] = None + rows_to_copy: int = 0 + copied_count: int = 0 + + # Timing information + timer_start: float = 0.0 + + # Processing flags and results + pii_found: bool = False + file_hash: str = "" + content_length: int = 0 + + # Intermediate files (for tracking) + qsv_index_file: str = "" + + @property + def metadata(self) -> Dict[str, Any]: + """Convenience property to access input metadata.""" + return self.input.get("metadata", {}) + + def update_tmp(self, new_tmp: str) -> None: + """ + Update the current working CSV file path. + + Args: + new_tmp: Path to the new temporary CSV file + """ + self.tmp = new_tmp + self.logger.log(5, f"Updated tmp file to: {new_tmp}") # TRACE level + + def add_stat(self, key: str, value: Any) -> None: + """ + Add a statistic to the dataset stats. + + Args: + key: Statistics key + value: Statistics value + """ + self.dataset_stats[key] = value diff --git a/ckanext/datapusher_plus/jobs/pipeline.py b/ckanext/datapusher_plus/jobs/pipeline.py new file mode 100644 index 0000000..41d3e52 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/pipeline.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +""" +DataPusher Plus Pipeline + +Main orchestration logic for the refactored jobs module. 
+""" + +import sys +import time +import logging +import tempfile +import traceback +import sqlalchemy as sa +from pathlib import Path +from typing import Dict, Any, Optional, List +from rq import get_current_job + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.logging_utils import TRACE +from ckanext.datapusher_plus.qsv_utils import QSVCommand +from ckanext.datapusher_plus.jobs.context import ProcessingContext +from ckanext.datapusher_plus.jobs.stages.download import DownloadStage +from ckanext.datapusher_plus.jobs.stages.format_converter import FormatConverterStage +from ckanext.datapusher_plus.jobs.stages.validation import ValidationStage +from ckanext.datapusher_plus.jobs.stages.analysis import AnalysisStage +from ckanext.datapusher_plus.jobs.stages.database import DatabaseStage +from ckanext.datapusher_plus.jobs.stages.indexing import IndexingStage +from ckanext.datapusher_plus.jobs.stages.formula import FormulaStage +from ckanext.datapusher_plus.jobs.stages.metadata import MetadataStage + + +# Re-export validation functions for backward compatibility +def validate_input(input: Dict[str, Any]) -> None: + """ + Validates input dictionary contains required metadata and resource_id. + + Args: + input: Input dictionary + + Raises: + utils.JobError: If validation fails + """ + if "metadata" not in input: + raise utils.JobError("Metadata missing") + + data = input["metadata"] + + if "resource_id" not in data: + raise utils.JobError("No id provided.") + + +def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: + """ + Sends callback to CKAN with job status updates. + + Args: + result_url: URL to send callback to + job_dict: Job status dictionary + + Returns: + True if callback successful, False otherwise + """ + import json + import requests + + api_token = utils.get_dp_plus_user_apitoken() + headers: Dict[str, str] = { + "Content-Type": "application/json", + "Authorization": api_token, + } + + try: + result = requests.post( + result_url, + data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), + verify=conf.SSL_VERIFY, + headers=headers, + ) + except requests.ConnectionError: + return False + + return result.status_code == requests.codes.ok + + +def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: + """ + Main function called by the datapusher_plus worker. + + Errors are caught and logged in the database. 
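+
+    A minimal ``input`` payload looks roughly like this (illustrative values;
+    only the keys used by this module are shown):
+
+        {
+            "metadata": {
+                "resource_id": "<resource-id>",
+                "ckan_url": "https://ckan.example.org",
+            },
+            "result_url": "<callback-url>",
+        }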
+ + Args: + input: Dictionary containing metadata and other job information + + Returns: + Optional[str]: Returns "error" if there was an error, None otherwise + """ + job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") + callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) + + job_id = get_current_job().id + errored = False + try: + push_to_datastore(input, job_id) + job_dict["status"] = "complete" + dph.mark_job_as_completed(job_id, job_dict) + except utils.JobError as e: + dph.mark_job_as_errored(job_id, str(e)) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + except Exception as e: + dph.mark_job_as_errored( + job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) + ) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + finally: + is_saved_ok = callback_datapusher_hook( + result_url=input["result_url"], job_dict=job_dict + ) + errored = errored or not is_saved_ok + return "error" if errored else None + + +def push_to_datastore( + input: Dict[str, Any], task_id: str, dry_run: bool = False +) -> Optional[List[Dict[str, Any]]]: + """ + Download and parse a resource push its data into CKAN's DataStore. + + An asynchronous job that gets a resource from CKAN, downloads the + resource's data file and, if the data file has changed since last time, + parses the data and posts it into CKAN's DataStore. + + Args: + input: Dictionary containing metadata and other job information + task_id: Unique identifier for the task + dry_run: If True, fetch and parse the data file but don't actually post the + data to the DataStore, instead return the data headers and rows that + would have been posted. + + Returns: + Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows + that would have been posted. Otherwise returns None. + """ + # Ensure temporary files are removed after run + with tempfile.TemporaryDirectory() as temp_dir: + return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) + + +def _push_to_datastore( + task_id: str, + input: Dict[str, Any], + dry_run: bool = False, + temp_dir: Optional[str] = None, +) -> Optional[List[Dict[str, Any]]]: + """ + Internal function that processes the resource through the pipeline. 
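+
+    The context is passed through the stages registered on
+    DataProcessingPipeline, in order: Download, FormatConverter, Validation,
+    Analysis, Database, Indexing, Formula, Metadata. In dry-run mode the
+    inferred headers are returned instead of being pushed, e.g.
+    (illustrative shape):
+
+        [{"id": "station_id", "type": "integer", "info": {"label": "Station ID"}}]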
+ + Args: + task_id: Unique task identifier + input: Input dictionary with metadata + dry_run: If True, don't actually push to datastore + temp_dir: Temporary directory path + + Returns: + Optional list of headers dicts if dry_run is True + """ + # Register job + try: + dph.add_pending_job(task_id, **input) + except sa.exc.IntegrityError: + raise utils.JobError("Job already exists.") + + # Setup logging + handler = utils.StoringHandler(task_id, input) + logger = logging.getLogger(task_id) + logger.addHandler(handler) + logger.addHandler(logging.StreamHandler()) + + # Set log level + try: + log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) + except AttributeError: + log_level = TRACE + + logger.setLevel(logging.INFO) + logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") + logger.setLevel(log_level) + + # Validate QSV binary exists + if not Path(conf.QSV_BIN).is_file(): + raise utils.JobError(f"{conf.QSV_BIN} not found.") + + # Initialize QSV + qsv = QSVCommand(logger=logger) + + # Validate input + validate_input(input) + + # Extract metadata + data = input["metadata"] + ckan_url = data["ckan_url"] + resource_id = data["resource_id"] + + # Fetch resource + try: + resource = dsu.get_resource(resource_id) + except utils.JobError: + # Retry once after 5 seconds + time.sleep(5) + resource = dsu.get_resource(resource_id) + + # Check if resource is datastore type + if resource.get("url_type") == "datastore": + logger.info("Dump files are managed with the Datastore API") + return + + # Create processing context + context = ProcessingContext( + task_id=task_id, + input=input, + dry_run=dry_run, + temp_dir=temp_dir, + logger=logger, + qsv=qsv, + resource=resource, + resource_id=resource_id, + ckan_url=ckan_url, + ) + + # Create and run pipeline + pipeline = DataProcessingPipeline() + result_context = pipeline.execute(context) + + # Return headers if dry run + if dry_run and result_context: + return result_context.headers_dicts + + return None + + +class DataProcessingPipeline: + """ + Orchestrates the data processing pipeline through sequential stages. + + Each stage processes the context and returns it (possibly modified). + If a stage returns None, the pipeline stops execution. + """ + + def __init__(self): + """Initialize the pipeline with all processing stages.""" + self.stages = [ + DownloadStage(), + FormatConverterStage(), + ValidationStage(), + AnalysisStage(), + DatabaseStage(), + IndexingStage(), + FormulaStage(), + MetadataStage(), + ] + + def execute(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Execute all pipeline stages sequentially. 
+
+        Args:
+            context: Initial processing context
+
+        Returns:
+            Final processing context, or None if pipeline was aborted
+
+        Raises:
+            utils.JobError: If any stage fails
+        """
+        # Keep a reference to the logger so we can still report why the
+        # pipeline stopped once a stage has returned None.
+        logger = context.logger
+        for stage in self.stages:
+            try:
+                context = stage(context)
+
+                # If stage returns None, stop pipeline
+                if context is None:
+                    logger.info(f"Pipeline stopped after stage: {stage.name}")
+                    return None
+
+            except utils.JobError:
+                # Re-raise JobErrors as-is
+                raise
+            except Exception as e:
+                # Wrap other exceptions
+                raise utils.JobError(
+                    f"Stage {stage.name} failed with error: {str(e)}"
+                ) from e
+
+        logger.info("Pipeline completed successfully!")
+        return context
diff --git a/ckanext/datapusher_plus/jobs/stages/__init__.py b/ckanext/datapusher_plus/jobs/stages/__init__.py
new file mode 100644
index 0000000..bce429a
--- /dev/null
+++ b/ckanext/datapusher_plus/jobs/stages/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+Processing stages for the DataPusher Plus pipeline.
+
+Each stage handles a specific part of the ETL process.
+"""
+
+from ckanext.datapusher_plus.jobs.stages.base import BaseStage
+
+__all__ = ["BaseStage"]
diff --git a/ckanext/datapusher_plus/jobs/stages/analysis.py b/ckanext/datapusher_plus/jobs/stages/analysis.py
new file mode 100644
index 0000000..0fbf522
--- /dev/null
+++ b/ckanext/datapusher_plus/jobs/stages/analysis.py
@@ -0,0 +1,587 @@
+# -*- coding: utf-8 -*-
+"""
+Analysis stage for the DataPusher Plus pipeline.
+
+Handles type inference, statistics, frequency tables, and PII screening.
+"""
+
+import os
+import csv
+import time
+import json
+from typing import List, Dict, Any
+
+import ckanext.datapusher_plus.utils as utils
+import ckanext.datapusher_plus.config as conf
+import ckanext.datapusher_plus.datastore_utils as dsu
+from ckanext.datapusher_plus.pii_screening import screen_for_pii
+from ckanext.datapusher_plus.jobs.stages.base import BaseStage
+from ckanext.datapusher_plus.jobs.context import ProcessingContext
+
+
+class AnalysisStage(BaseStage):
+    """
+    Analyzes CSV file to infer types and generate statistics.
+
+    Responsibilities:
+    - Extract and sanitize headers
+    - Infer data types
+    - Generate statistics
+    - Create frequency tables
+    - Generate preview if needed
+    - Normalize dates to RFC3339
+    - Screen for PII
+    """
+
+    def __init__(self):
+        super().__init__(name="Analysis")
+
+    def process(self, context: ProcessingContext) -> ProcessingContext:
+        """
+        Analyze CSV file and infer schema.
+ + Args: + context: Processing context + + Returns: + Updated context with schema information + + Raises: + utils.JobError: If analysis fails + """ + analysis_start = time.perf_counter() + + # Extract headers and sanitize + original_header_dict = self._extract_headers(context) + self._sanitize_headers(context) + + # Create index for faster operations + self._create_index(context) + + # Get record count if not already available + record_count = context.dataset_stats.get("RECORD_COUNT") + if not record_count: + record_count = self._count_records(context) + + # Check if empty + if record_count == 0: + context.logger.warning("Upload skipped as there are zero records.") + return None + + # Log record count + unique_qualifier = "unique" if conf.DEDUP else "" + context.logger.info(f"{record_count} {unique_qualifier} records detected...") + + # Infer types and generate statistics + headers_dicts, datetimecols_list, resource_fields_stats = ( + self._infer_types_and_stats(context, original_header_dict) + ) + + # Store headers in context + context.headers_dicts = headers_dicts + context.headers = [h["id"] for h in headers_dicts] + context.original_header_dict = original_header_dict + + # Generate frequency tables + resource_fields_freqs = self._generate_frequency_tables(context) + + # Update field stats with frequency data + for field, freqs in resource_fields_freqs.items(): + if field in resource_fields_stats: + resource_fields_stats[field]["freqs"] = freqs + + # Store field stats in context for FormulaStage + context.resource_fields_stats = resource_fields_stats + context.resource_fields_freqs = resource_fields_freqs + + # Generate preview if needed + context.rows_to_copy = record_count + if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: + context.rows_to_copy = self._generate_preview(context, record_count) + + # Normalize dates to RFC3339 + if datetimecols_list: + self._normalize_dates(context, datetimecols_list) + + # Analysis complete + analysis_elapsed = time.perf_counter() - analysis_start + context.logger.info( + f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." + ) + + # PII Screening + self._screen_pii(context) + + # Remove index file + if context.qsv_index_file and os.path.exists(context.qsv_index_file): + os.remove(context.qsv_index_file) + + return context + + def _extract_headers(self, context: ProcessingContext) -> Dict[int, str]: + """ + Extract original headers from CSV. + + Args: + context: Processing context + + Returns: + Dictionary mapping column index to original header name + + Raises: + utils.JobError: If headers cannot be extracted + """ + try: + qsv_headers = context.qsv.headers(context.tmp, just_names=True) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + original_headers = str(qsv_headers.stdout).strip() + original_header_dict = { + idx: ele for idx, ele in enumerate(original_headers.splitlines()) + } + return original_header_dict + + def _sanitize_headers(self, context: ProcessingContext) -> None: + """ + Sanitize headers to be database-safe. 
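+
+        For example, a header such as ``Date of Birth!`` would be rewritten by
+        ``qsv safenames`` to a database-safe identifier; names listed in
+        conf.RESERVED_COLNAMES and otherwise-unsafe names are adjusted, with
+        conf.UNSAFE_PREFIX available as a prefix. (The exact replacement rules
+        are qsv's, not this module's.)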
+ + Args: + context: Processing context + + Raises: + utils.JobError: If header sanitization fails + """ + context.logger.info('Checking for "database-safe" header names...') + + try: + qsv_safenames = context.qsv.safenames( + context.tmp, + mode="json", + reserved=conf.RESERVED_COLNAMES, + prefix=conf.UNSAFE_PREFIX, + uses_stdio=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + unsafe_json = json.loads(str(qsv_safenames.stdout)) + unsafe_headers = unsafe_json["unsafe_headers"] + + if unsafe_headers: + context.logger.info( + f'"{len(unsafe_headers)} unsafe" header names found ' + f"({unsafe_headers}). Sanitizing...\"" + ) + qsv_safenames_csv = os.path.join(context.temp_dir, "qsv_safenames.csv") + context.qsv.safenames( + context.tmp, mode="conditional", output_file=qsv_safenames_csv + ) + context.update_tmp(qsv_safenames_csv) + else: + context.logger.info("No unsafe header names found...") + + def _create_index(self, context: ProcessingContext) -> None: + """ + Create QSV index for faster operations. + + Args: + context: Processing context + + Raises: + utils.JobError: If index creation fails + """ + try: + context.qsv_index_file = context.tmp + ".idx" + context.qsv.index(context.tmp) + except utils.JobError as e: + raise utils.JobError(f"Cannot index CSV: {e}") + + def _count_records(self, context: ProcessingContext) -> int: + """ + Count records in CSV. + + Args: + context: Processing context + + Returns: + Number of records + + Raises: + utils.JobError: If counting fails + """ + try: + qsv_count = context.qsv.count(context.tmp) + record_count = int(str(qsv_count.stdout).strip()) + context.add_stat("RECORD_COUNT", record_count) + return record_count + except utils.JobError as e: + raise utils.JobError(f"Cannot count records in CSV: {e}") + + def _infer_types_and_stats( + self, context: ProcessingContext, original_header_dict: Dict[int, str] + ) -> tuple[List[Dict[str, Any]], List[str], Dict[str, Any]]: + """ + Infer data types and compile statistics. 
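+
+        Illustrative shape of the returned values (field names and numbers
+        are made up; the real stats rows carry more keys):
+
+            headers_dicts = [
+                {"id": "station_id", "type": "integer", "info": {"label": "Station ID"}},
+                {"id": "observed_at", "type": "timestamp", "info": {"label": "Observed At"}},
+            ]
+            datetimecols_list = ["observed_at"]
+            resource_fields_stats = {"station_id": {"stats": {"type": "Integer", "min": "1", "max": "42"}}}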
+ + Args: + context: Processing context + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list, resource_fields_stats) + + Raises: + utils.JobError: If type inference fails + """ + context.logger.info("Inferring data types and compiling statistics...") + + qsv_stats_csv = os.path.join(context.temp_dir, "qsv_stats.csv") + + # Determine if we need special handling for spatial formats + spatial_format_flag = context.resource.get("format", "").upper() in [ + "SHP", + "QGIS", + "GEOJSON", + ] + + # Run qsv stats + try: + if spatial_format_flag: + env = os.environ.copy() + env["QSV_STATS_STRING_MAX_LENGTH"] = str( + conf.QSV_STATS_STRING_MAX_LENGTH + ) + context.qsv.stats( + context.tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + env=env, + ) + else: + context.qsv.stats( + context.tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") + + # Parse stats + return self._parse_stats( + context, qsv_stats_csv, original_header_dict + ) + + def _parse_stats( + self, + context: ProcessingContext, + stats_csv: str, + original_header_dict: Dict[int, str], + ) -> tuple[List[Dict[str, Any]], List[str], Dict[str, Any]]: + """ + Parse statistics CSV and build headers dictionary. + + Args: + context: Processing context + stats_csv: Path to stats CSV + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list, resource_fields_stats) + """ + headers = [] + types = [] + headers_min = [] + headers_max = [] + headers_cardinality = [] + resource_fields_stats = {} + + with open(stats_csv, mode="r") as inp: + reader = csv.DictReader(inp) + for row in reader: + # Add to stats dictionary + resource_fields_stats[row["field"]] = {"stats": row} + + fr = {k: v for k, v in row.items()} + schema_field = fr.get("field", "Unnamed Column") + if schema_field.startswith("qsv_"): + break + + headers.append(schema_field) + types.append(fr.get("type", "String")) + headers_min.append(fr["min"]) + headers_max.append(fr["max"]) + if conf.AUTO_INDEX_THRESHOLD: + headers_cardinality.append(int(fr.get("cardinality") or 0)) + + # Store cardinality for indexing stage + if conf.AUTO_INDEX_THRESHOLD: + context.add_stat("HEADERS_CARDINALITY", headers_cardinality) + + # Check for existing datastore resource + existing = dsu.datastore_resource_exists(context.resource_id) + context.existing_info = None + if existing: + context.existing_info = dict( + (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f + ) + + # Override with types from Data Dictionary + if context.existing_info: + types = [ + { + "text": "String", + "numeric": "Float", + "timestamp": "DateTime", + }.get(context.existing_info.get(h, {}).get("type_override"), t) + for t, h in zip(types, headers) + ] + + # Delete existing datastore resource + if existing: + context.logger.info( + f'Deleting existing resource "{context.resource_id}" from datastore.' 
+ ) + dsu.delete_datastore_resource(context.resource_id) + + # Build headers_dicts + headers_dicts, datetimecols_list = self._build_headers_dicts( + context, headers, types, headers_min, headers_max, original_header_dict + ) + + context.logger.info(f"Determined headers and types: {headers_dicts}...") + + return headers_dicts, datetimecols_list, resource_fields_stats + + def _build_headers_dicts( + self, + context: ProcessingContext, + headers: List[str], + types: List[str], + headers_min: List[str], + headers_max: List[str], + original_header_dict: Dict[int, str], + ) -> tuple[List[Dict[str, Any]], List[str]]: + """ + Build headers dictionaries with proper types. + + Args: + context: Processing context + headers: List of header names + types: List of inferred types + headers_min: List of minimum values + headers_max: List of maximum values + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list) + """ + default_type = "String" + temp_headers_dicts = [ + dict( + id=field[0], + type=conf.TYPE_MAPPING.get( + str(field[1]) if field[1] else default_type, "text" + ), + ) + for field in zip(headers, types) + ] + + # Build final headers_dicts with smartint resolution + datetimecols_list = [] + headers_dicts = [] + + for idx, header in enumerate(temp_headers_dicts): + if header["type"] == "smartint": + # Select best integer type based on min/max + if ( + int(headers_max[idx]) <= conf.POSTGRES_INT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN + ): + header_type = "integer" + elif ( + int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN + ): + header_type = "bigint" + else: + header_type = "numeric" + else: + header_type = header["type"] + + if header_type == "timestamp": + datetimecols_list.append(header["id"]) + + info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) + headers_dicts.append( + dict(id=header["id"], type=header_type, info=info_dict) + ) + + # Preserve data dictionary from existing resource + if context.existing_info: + for h in headers_dicts: + if h["id"] in context.existing_info: + h["info"] = context.existing_info[h["id"]] + # Apply type overrides + type_override = context.existing_info[h["id"]].get("type_override") + if type_override in list(conf.TYPE_MAPPING.values()): + h["type"] = type_override + + return headers_dicts, datetimecols_list + + def _generate_frequency_tables( + self, context: ProcessingContext + ) -> Dict[str, List[Dict[str, str]]]: + """ + Generate frequency tables for each column. 
+ + Args: + context: Processing context + + Returns: + Dictionary mapping field names to frequency data + + Raises: + utils.JobError: If frequency table generation fails + """ + qsv_freq_csv = os.path.join(context.temp_dir, "qsv_freq.csv") + + try: + context.qsv.frequency( + context.tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a frequency table: {e}") + + resource_fields_freqs = {} + try: + with open(qsv_freq_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + field = row["field"] + if field not in resource_fields_freqs: + resource_fields_freqs[field] = [] + + resource_fields_freqs[field].append( + { + "value": row["value"], + "count": row["count"], + "percentage": row["percentage"], + } + ) + context.logger.log(5, f"Resource fields freqs: {resource_fields_freqs}") + except IOError as e: + raise utils.JobError(f"Could not open frequency CSV file: {e}") + + return resource_fields_freqs + + def _generate_preview(self, context: ProcessingContext, record_count: int) -> int: + """ + Generate a preview slice of the data. + + Args: + context: Processing context + record_count: Total number of records + + Returns: + Number of rows in preview + + Raises: + utils.JobError: If preview generation fails + """ + qsv_slice_csv = os.path.join(context.temp_dir, "qsv_slice.csv") + + if conf.PREVIEW_ROWS > 0: + # Positive: slice from beginning + context.logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") + try: + context.qsv.slice( + context.tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice: {e}") + rows_to_copy = conf.PREVIEW_ROWS + else: + # Negative: slice from end + slice_len = abs(conf.PREVIEW_ROWS) + context.logger.info(f"Preparing {slice_len}-row preview from the end...") + try: + context.qsv.slice( + context.tmp, start=-1, length=slice_len, output_file=qsv_slice_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice from the end: {e}") + rows_to_copy = slice_len + + context.update_tmp(qsv_slice_csv) + context.add_stat("PREVIEW_FILE_SIZE", os.path.getsize(qsv_slice_csv)) + context.add_stat("PREVIEW_RECORD_COUNT", rows_to_copy) + + return rows_to_copy + + def _normalize_dates( + self, context: ProcessingContext, datetimecols_list: List[str] + ) -> None: + """ + Normalize date columns to RFC3339 format. + + Args: + context: Processing context + datetimecols_list: List of datetime column names + + Raises: + utils.JobError: If date normalization fails + """ + qsv_applydp_csv = os.path.join(context.temp_dir, "qsv_applydp.csv") + datecols = ",".join(datetimecols_list) + + context.logger.info( + f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format ' + f"with PREFER_DMY: {conf.PREFER_DMY}..." + ) + + try: + context.qsv.datefmt( + datecols, + context.tmp, + prefer_dmy=conf.PREFER_DMY, + output_file=qsv_applydp_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Applydp error: {e}") + + context.update_tmp(qsv_applydp_csv) + + def _screen_pii(self, context: ProcessingContext) -> None: + """ + Screen for Personally Identifiable Information. 
+ + Args: + context: Processing context + """ + if conf.PII_SCREENING: + piiscreening_start = time.perf_counter() + context.pii_found = screen_for_pii( + context.tmp, + context.resource, + context.qsv, + context.temp_dir, + context.logger, + ) + piiscreening_elapsed = time.perf_counter() - piiscreening_start + context.logger.info( + f"PII screening completed in {piiscreening_elapsed:,.2f} seconds" + ) + + context.add_stat("PII_SCREENING", conf.PII_SCREENING) + context.add_stat("PII_FOUND", context.pii_found) diff --git a/ckanext/datapusher_plus/jobs/stages/base.py b/ckanext/datapusher_plus/jobs/stages/base.py new file mode 100644 index 0000000..bed8efa --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/base.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +""" +Base stage class for the DataPusher Plus pipeline. + +All pipeline stages inherit from this base class. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class BaseStage(ABC): + """ + Abstract base class for all pipeline stages. + + Each stage processes the context and returns it (possibly modified). + Stages can skip processing by returning None. + """ + + def __init__(self, name: Optional[str] = None): + """ + Initialize the stage. + + Args: + name: Optional name for the stage (defaults to class name) + """ + self.name = name or self.__class__.__name__ + + @abstractmethod + def process(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Process the context through this stage. + + Args: + context: The processing context containing all state + + Returns: + The modified context, or None to skip this stage + + Raises: + utils.JobError: If processing fails + """ + pass + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Determine if this stage should be skipped. + + Override this method to add conditional stage execution. + + Args: + context: The processing context + + Returns: + True if the stage should be skipped, False otherwise + """ + return False + + def __call__(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Make the stage callable. + + This allows stages to be used as: stage(context) + + Args: + context: The processing context + + Returns: + The modified context, or None to skip + """ + if self.should_skip(context): + context.logger.info(f"Skipping stage: {self.name}") + return context + + context.logger.info(f"Starting stage: {self.name}") + result = self.process(context) + context.logger.info(f"Completed stage: {self.name}") + return result + + def __repr__(self) -> str: + """String representation of the stage.""" + return f"<{self.name}>" diff --git a/ckanext/datapusher_plus/jobs/stages/database.py b/ckanext/datapusher_plus/jobs/stages/database.py new file mode 100644 index 0000000..e996be8 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/database.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +""" +Database stage for the DataPusher Plus pipeline. + +Handles copying data to the PostgreSQL datastore. +""" + +import time +import psycopg2 +from psycopg2 import sql + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class DatabaseStage(BaseStage): + """ + Copies data to PostgreSQL datastore. 
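+
+    The data is loaded with a server-side COPY roughly equivalent to the
+    following (illustrative table and column names):
+
+        COPY "<resource-id>" ("station_id", "observed_at")
+        FROM STDIN WITH (FORMAT CSV, FREEZE 1, HEADER 1, ENCODING 'UTF8');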
+ + Responsibilities: + - Create empty datastore table with schema + - Use PostgreSQL COPY to efficiently load data + - Run VACUUM ANALYZE for performance + """ + + def __init__(self): + super().__init__(name="Database") + + def should_skip(self, context: ProcessingContext) -> bool: + """Skip if in dry run mode.""" + return context.dry_run + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Copy data to datastore. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If database operations fail + """ + if context.dry_run: + context.logger.warning( + "Dry run only. Returning without copying to the Datastore..." + ) + return context + + copy_start = time.perf_counter() + + if conf.PREVIEW_ROWS: + context.logger.info( + f"COPYING {context.rows_to_copy}-row preview to Datastore..." + ) + else: + context.logger.info( + f"COPYING {context.rows_to_copy} rows to Datastore..." + ) + + # Create empty datastore table + self._create_datastore_table(context) + + # Copy data using PostgreSQL COPY + copied_count = self._copy_data(context) + + context.copied_count = copied_count + + copy_elapsed = time.perf_counter() - copy_start + context.logger.info( + f'...copying done. Copied {copied_count} rows to "{context.resource_id}" ' + f"in {copy_elapsed:,.2f} seconds." + ) + + return context + + def _create_datastore_table(self, context: ProcessingContext) -> None: + """ + Create empty datastore table with schema. + + Args: + context: Processing context + """ + dsu.send_resource_to_datastore( + resource=None, + resource_id=context.resource["id"], + headers=context.headers_dicts, + records=None, + aliases=None, + calculate_record_count=False, + ) + + def _copy_data(self, context: ProcessingContext) -> int: + """ + Copy data to datastore using PostgreSQL COPY. + + Args: + context: Processing context + + Returns: + Number of rows copied + + Raises: + utils.JobError: If COPY operation fails + """ + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + cur = raw_connection.cursor() + + # Truncate table for COPY FREEZE optimization + self._truncate_table(cur, context.resource_id) + + # Prepare COPY SQL + col_names_list = [h["id"] for h in context.headers_dicts] + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, FREEZE 1, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(context.resource_id), + column_names, + ) + + # Execute COPY + with open(context.tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: + try: + cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + copied_count = cur.rowcount + + raw_connection.commit() + + # VACUUM ANALYZE for performance + self._vacuum_analyze(raw_connection, context.resource_id) + + return copied_count + + finally: + if raw_connection: + raw_connection.close() + + def _truncate_table(self, cursor: psycopg2.extensions.cursor, resource_id: str) -> None: + """ + Truncate table to enable COPY FREEZE optimization. 
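+
+        PostgreSQL only honours COPY ... FREEZE when the target table was
+        created or truncated within the current (sub)transaction, which is
+        why the table is truncated here immediately before the COPY.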
+ + Args: + cursor: Database cursor + resource_id: Resource ID (table name) + """ + try: + cursor.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) + ) + except psycopg2.Error as e: + # Non-fatal, log warning but continue + # (table might not exist yet) + pass + + def _vacuum_analyze( + self, connection: psycopg2.extensions.connection, resource_id: str + ) -> None: + """ + Run VACUUM ANALYZE on the table. + + Args: + connection: Database connection + resource_id: Resource ID (table name) + """ + # Set isolation level for VACUUM + connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + + analyze_cur = connection.cursor() + try: + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + finally: + analyze_cur.close() diff --git a/ckanext/datapusher_plus/jobs/stages/download.py b/ckanext/datapusher_plus/jobs/stages/download.py new file mode 100644 index 0000000..14edd42 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/download.py @@ -0,0 +1,379 @@ +# -*- coding: utf-8 -*- +""" +Download stage for the DataPusher Plus pipeline. + +Handles downloading resources, hash checking, and ZIP file extraction. +""" + +import os +import time +import hashlib +import mimetypes +from typing import Dict, Any +from urllib.parse import urlsplit, urlparse + +import requests +from datasize import DataSize +from dateutil.parser import parse as parsedate + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.job_exceptions import HTTPError +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class DownloadStage(BaseStage): + """ + Downloads the resource file, validates it, and handles ZIP extraction. + + Responsibilities: + - Validate resource URL scheme + - Download file with authentication if needed + - Calculate file hash for deduplication + - Check if file has changed since last upload + - Extract ZIP files if applicable + """ + + def __init__(self): + super().__init__(name="Download") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Download and validate the resource file. + + Args: + context: Processing context + + Returns: + Updated context with downloaded file information + + Raises: + utils.JobError: If download fails or file is invalid + """ + # Validate resource URL scheme + self._validate_url_scheme(context) + + # Start timing + context.timer_start = time.perf_counter() + + # Download the file + file_hash, length, resource_format, response_headers = self._download_file(context) + + # Store file information + context.file_hash = file_hash + context.content_length = length + context.add_stat("ORIGINAL_FILE_SIZE", length) + + # Check for file deduplication + if self._should_skip_upload(context, file_hash, response_headers): + context.logger.warning( + f"Upload skipped as the file hash hasn't changed: {file_hash}." + ) + return None # Signal to skip further processing + + # Update resource hash + context.resource["hash"] = file_hash + + # Log download completion + fetch_elapsed = time.perf_counter() - context.timer_start + context.logger.info( + f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." 
+ ) + + # Handle ZIP file extraction + self._handle_zip_file(context, resource_format) + + return context + + def _validate_url_scheme(self, context: ProcessingContext) -> None: + """ + Validate that the resource URL uses an allowed scheme. + + Args: + context: Processing context + + Raises: + utils.JobError: If URL scheme is not allowed + """ + context.resource_url = context.resource.get("url") + scheme = urlsplit(context.resource_url).scheme + if scheme not in ("http", "https", "ftp"): + raise utils.JobError("Only http, https, and ftp resources may be fetched.") + + def _download_file( + self, context: ProcessingContext + ) -> tuple[str, int, str, Dict[str, Any]]: + """ + Download the resource file and calculate its hash. + + Args: + context: Processing context + + Returns: + Tuple of (file_hash, file_length, resource_format, response_headers) + + Raises: + HTTPError: If download fails + utils.JobError: If file is too large or format cannot be determined + """ + resource_url = context.resource_url + context.logger.info(f"Fetching from: {resource_url}...") + + # Prepare request headers + headers: Dict[str, str] = {} + if context.resource.get("url_type") == "upload": + # Authenticate for uploaded files + api_token = utils.get_dp_plus_user_apitoken() + headers["Authorization"] = api_token + + # Rewrite URL if needed (for firewalls) + resource_url = self._rewrite_url_if_needed( + context, resource_url, context.ckan_url + ) + + # Configure request + kwargs: Dict[str, Any] = { + "headers": headers, + "timeout": conf.TIMEOUT, + "verify": conf.SSL_VERIFY, + "stream": True, + } + if conf.USE_PROXY: + kwargs["proxies"] = { + "http": conf.DOWNLOAD_PROXY, + "https": conf.DOWNLOAD_PROXY, + } + + # Download file + try: + with requests.get(resource_url, **kwargs) as response: + response.raise_for_status() + + # Get content info + cl = response.headers.get("content-length") + max_content_length = conf.MAX_CONTENT_LENGTH + ct = response.headers.get("content-type") + + # Check size before download + if cl: + try: + if int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: + raise utils.JobError( + f"Resource too large to download: {DataSize(int(cl)):.2MB} " + f"> max ({DataSize(int(max_content_length)):.2MB})." + ) + except ValueError: + pass + + # Determine file format + resource_format = self._determine_format( + context, ct, response.headers + ) + + # Download and hash the file + file_hash, length = self._stream_download( + context, resource_format, response, max_content_length + ) + + return file_hash, length, resource_format, dict(response.headers) + + except requests.HTTPError as e: + raise HTTPError( + f"DataPusher+ received a bad HTTP response when trying to download " + f"the data file from {resource_url}. Status code: {e.response.status_code}, " + f"Response content: {e.response.content}", + status_code=e.response.status_code, + request_url=resource_url, + response=e.response.content, + ) + except requests.RequestException as e: + raise HTTPError( + message=str(e), + status_code=None, + request_url=resource_url, + response=None, + ) + + def _rewrite_url_if_needed( + self, context: ProcessingContext, resource_url: str, ckan_url: str + ) -> str: + """ + Rewrite URL if CKAN is behind a firewall. 
+ + Args: + context: Processing context + resource_url: Original resource URL + ckan_url: CKAN base URL + + Returns: + Potentially rewritten URL + """ + if not resource_url.startswith(ckan_url): + new_url = urlparse(resource_url) + rewrite_url = urlparse(ckan_url) + new_url = new_url._replace( + scheme=rewrite_url.scheme, netloc=rewrite_url.netloc + ) + resource_url = new_url.geturl() + context.logger.info(f"Rewritten resource url to: {resource_url}") + return resource_url + + def _determine_format( + self, context: ProcessingContext, content_type: str, headers: Dict[str, Any] + ) -> str: + """ + Determine the file format from resource metadata or content type. + + Args: + context: Processing context + content_type: HTTP content-type header + headers: Response headers + + Returns: + File format string (uppercase) + + Raises: + utils.JobError: If format cannot be determined + """ + resource_format = context.resource.get("format", "").upper() + + if not resource_format: + context.logger.info("File format: NOT SPECIFIED") + if content_type: + extension = mimetypes.guess_extension(content_type.split(";")[0]) + if extension is None: + raise utils.JobError( + "Cannot determine format from mime type. Please specify format." + ) + resource_format = extension.lstrip(".").upper() + context.logger.info(f"Inferred file format: {resource_format}") + else: + raise utils.JobError( + "Server did not return content-type. Please specify format." + ) + else: + context.logger.info(f"File format: {resource_format}") + + return resource_format + + def _stream_download( + self, + context: ProcessingContext, + resource_format: str, + response: requests.Response, + max_content_length: int, + ) -> tuple[str, int]: + """ + Stream download the file and calculate its hash. + + Args: + context: Processing context + resource_format: File format extension + response: HTTP response object + max_content_length: Maximum allowed file size + + Returns: + Tuple of (file_hash, file_length) + + Raises: + utils.JobError: If file exceeds maximum size + """ + tmp = os.path.join(context.temp_dir, "tmp." + resource_format) + context.update_tmp(tmp) + + length = 0 + # Using MD5 for file deduplication only (not for security) + m = hashlib.md5() # DevSkim: ignore DS126858 + + # Log download start + cl = response.headers.get("content-length") + if cl: + context.logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") + else: + context.logger.info("Downloading file of unknown size...") + + # Stream download + with open(tmp, "wb") as tmp_file: + for chunk in response.iter_content(conf.CHUNK_SIZE): + length += len(chunk) + if length > max_content_length and not conf.PREVIEW_ROWS: + raise utils.JobError( + f"Resource too large to process: {length} > max ({max_content_length})." + ) + tmp_file.write(chunk) + m.update(chunk) + + return m.hexdigest(), length + + def _should_skip_upload( + self, + context: ProcessingContext, + file_hash: str, + response_headers: Dict[str, Any], + ) -> bool: + """ + Check if upload should be skipped due to unchanged file. 
+ + Args: + context: Processing context + file_hash: MD5 hash of downloaded file + response_headers: HTTP response headers + + Returns: + True if upload should be skipped, False otherwise + """ + # Check if resource metadata was updated + resource_updated = False + resource_last_modified = context.resource.get("last_modified") + if resource_last_modified: + resource_last_modified = parsedate(resource_last_modified) + file_last_modified = response_headers.get("last-modified") + if file_last_modified: + file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) + if file_last_modified < resource_last_modified: + resource_updated = True + + # Skip if hash matches and not forced + metadata = context.metadata + return ( + context.resource.get("hash") == file_hash + and not metadata.get("ignore_hash") + and not conf.IGNORE_FILE_HASH + and not resource_updated + ) + + def _handle_zip_file(self, context: ProcessingContext, resource_format: str) -> None: + """ + Extract ZIP file if applicable. + + Args: + context: Processing context + resource_format: File format + + Returns: + None, but updates context.tmp if ZIP is extracted + """ + if resource_format.upper() == "ZIP": + context.logger.info("Processing ZIP file...") + + file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( + context.tmp, context.temp_dir, context.logger + ) + + if not file_count: + context.logger.error("ZIP file invalid or no files found in ZIP file.") + return None + + if file_count > 1: + context.logger.info( + f"More than one file in the ZIP file ({file_count} files), " + f"saving metadata..." + ) + else: + context.logger.info( + f"Extracted {unzipped_format} file: {extracted_path}" + ) + + context.update_tmp(extracted_path) diff --git a/ckanext/datapusher_plus/jobs/stages/format_converter.py b/ckanext/datapusher_plus/jobs/stages/format_converter.py new file mode 100644 index 0000000..269144c --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/format_converter.py @@ -0,0 +1,382 @@ +# -*- coding: utf-8 -*- +""" +Format Converter stage for the DataPusher Plus pipeline. + +Handles conversion of various file formats to CSV. +""" + +import os +import uuid +import subprocess +from typing import Optional + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.spatial_helpers as sh +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class FormatConverterStage(BaseStage): + """ + Converts various file formats to CSV. + + Responsibilities: + - Convert spreadsheets (XLS, XLSX, ODS, etc.) to CSV + - Convert spatial formats (SHP, GEOJSON) to CSV + - Normalize CSV/TSV/TAB files + - Transcode to UTF-8 + """ + + # Supported format types + SPREADSHEET_EXTENSIONS = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] + SPATIAL_FORMATS = ["SHP", "QGIS", "GEOJSON"] + + def __init__(self): + super().__init__(name="FormatConverter") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Convert file format to CSV. 
+ + Args: + context: Processing context + + Returns: + Updated context with CSV file + + Raises: + utils.JobError: If conversion fails + """ + resource_format = context.resource.get("format", "").upper() + + # Check if file is a spreadsheet + if resource_format in self.SPREADSHEET_EXTENSIONS: + self._convert_spreadsheet(context, resource_format) + # Check if file is a spatial format + elif resource_format in self.SPATIAL_FORMATS: + self._convert_spatial_format(context, resource_format) + # Otherwise normalize as CSV/TSV/TAB + else: + self._normalize_csv(context, resource_format) + + return context + + def _convert_spreadsheet( + self, context: ProcessingContext, file_format: str + ) -> None: + """ + Convert spreadsheet to CSV using qsv excel. + + Args: + context: Processing context + file_format: Spreadsheet format (XLS, XLSX, etc.) + + Raises: + utils.JobError: If conversion fails + """ + default_excel_sheet = conf.DEFAULT_EXCEL_SHEET + context.logger.info( + f"Converting {file_format} sheet {default_excel_sheet} to CSV..." + ) + + # Create hardlink with proper extension + qsv_spreadsheet = os.path.join( + context.temp_dir, "qsv_spreadsheet." + file_format + ) + os.link(context.tmp, qsv_spreadsheet) + + # Run qsv excel to export to CSV + qsv_excel_csv = os.path.join(context.temp_dir, "qsv_excel.csv") + try: + qsv_excel = context.qsv.excel( + qsv_spreadsheet, + sheet=default_excel_sheet, + trim=True, + output_file=qsv_excel_csv, + ) + except utils.JobError as e: + raise utils.JobError( + f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" + ) + + excel_export_msg = qsv_excel.stderr + context.logger.info(f"{excel_export_msg}...") + context.update_tmp(qsv_excel_csv) + + def _convert_spatial_format( + self, context: ProcessingContext, resource_format: str + ) -> None: + """ + Convert spatial format to CSV. + + Args: + context: Processing context + resource_format: Spatial format (SHP, GEOJSON, etc.) + + Raises: + utils.JobError: If conversion fails + """ + context.logger.info("SHAPEFILE or GEOJSON file detected...") + + # Create unique spatial file + qsv_spatial_file = os.path.join( + context.temp_dir, + f"qsv_spatial_{uuid.uuid4()}.{resource_format}", + ) + os.link(context.tmp, qsv_spatial_file) + qsv_spatial_csv = os.path.join(context.temp_dir, "qsv_spatial.csv") + + simplification_failed = False + + # Try spatial simplification if enabled + if conf.AUTO_SPATIAL_SIMPLIFICATION: + simplification_failed = not self._try_spatial_simplification( + context, qsv_spatial_file, qsv_spatial_csv, resource_format + ) + + # Fallback to qsv geoconvert if simplification failed or disabled + if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed: + self._geoconvert(context, qsv_spatial_file, resource_format) + + def _try_spatial_simplification( + self, + context: ProcessingContext, + spatial_file: str, + output_csv: str, + resource_format: str, + ) -> bool: + """ + Try to convert and simplify spatial file. + + Args: + context: Processing context + spatial_file: Path to spatial file + output_csv: Output CSV path + resource_format: Spatial format + + Returns: + True if successful, False otherwise + """ + context.logger.info( + f"Converting spatial file to CSV with a simplification relative " + f"tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." 
+ ) + + try: + success, error_message, bounds = sh.process_spatial_file( + spatial_file, + resource_format, + output_csv_path=output_csv, + tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, + task_logger=context.logger, + ) + + if success: + context.logger.info( + "Spatial file successfully simplified and converted to CSV" + ) + context.update_tmp(output_csv) + self._upload_simplified_resource(context, spatial_file, bounds) + return True + else: + context.logger.warning( + f"Upload of simplified spatial file failed: {error_message}" + ) + return False + + except Exception as e: + context.logger.warning(f"Simplification and conversion failed: {str(e)}") + context.logger.warning( + f"Simplification failed. Using qsv geoconvert to convert to CSV, " + f"truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." + ) + return False + + def _upload_simplified_resource( + self, context: ProcessingContext, spatial_file: str, bounds: Optional[tuple] + ) -> None: + """ + Upload simplified spatial resource to CKAN. + + Args: + context: Processing context + spatial_file: Path to simplified spatial file + bounds: Bounding box coordinates (minx, miny, maxx, maxy) + """ + resource = context.resource + simplified_resource_name = ( + os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1] + ) + + existing_resource, existing_resource_id = dsu.resource_exists( + resource["package_id"], simplified_resource_name + ) + + if existing_resource: + context.logger.info("Simplified resource already exists. Replacing it...") + dsu.delete_resource(existing_resource_id) + else: + context.logger.info("Simplified resource does not exist. Uploading it...") + + new_simplified_resource = { + "package_id": resource["package_id"], + "name": simplified_resource_name, + "url": "", + "format": resource["format"], + "hash": "", + "mimetype": resource["mimetype"], + "mimetype_inner": resource["mimetype_inner"], + } + + # Add bounds information if available + if bounds: + minx, miny, maxx, maxy = bounds + new_simplified_resource.update( + { + "dpp_spatial_extent": { + "type": "BoundingBox", + "coordinates": [[minx, miny], [maxx, maxy]], + } + } + ) + context.logger.info( + f"Added dpp_spatial_extent to resource metadata: {bounds}" + ) + + dsu.upload_resource(new_simplified_resource, spatial_file) + os.remove(spatial_file) + + def _geoconvert( + self, context: ProcessingContext, spatial_file: str, resource_format: str + ) -> None: + """ + Convert spatial file using qsv geoconvert. + + Args: + context: Processing context + spatial_file: Path to spatial file + resource_format: Spatial format + + Raises: + utils.JobError: If geoconvert fails + """ + context.logger.info("Converting spatial file to CSV using qsv geoconvert...") + + qsv_geoconvert_csv = os.path.join(context.temp_dir, "qsv_geoconvert.csv") + try: + context.qsv.geoconvert( + context.tmp, + resource_format, + "csv", + max_length=conf.QSV_STATS_STRING_MAX_LENGTH, + output_file=qsv_geoconvert_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"qsv geoconvert failed: {e}") + + context.update_tmp(qsv_geoconvert_csv) + context.logger.info("Geoconverted successfully") + + def _normalize_csv(self, context: ProcessingContext, resource_format: str) -> None: + """ + Normalize CSV/TSV/TAB and transcode to UTF-8. 
+ + Args: + context: Processing context + resource_format: File format + + Raises: + utils.JobError: If normalization fails + """ + # Log appropriate message + if resource_format == "CSV": + context.logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") + else: + context.logger.info( + f"Normalizing/UTF-8 transcoding {resource_format} to CSV..." + ) + + qsv_input_csv = os.path.join(context.temp_dir, "qsv_input.csv") + qsv_input_utf_8_encoded_csv = os.path.join( + context.temp_dir, "qsv_input_utf_8_encoded.csv" + ) + + # Detect file encoding + encoding = self._detect_encoding(context) + + # Re-encode to UTF-8 if needed + if encoding not in ("UTF-8", "ASCII"): + context.logger.info(f"File is not UTF-8 encoded. Re-encoding from {encoding} to UTF-8") + self._reencode_to_utf8(context, encoding, qsv_input_utf_8_encoded_csv) + source_file = qsv_input_utf_8_encoded_csv + else: + source_file = context.tmp + + # Normalize using qsv input + try: + context.qsv.input(source_file, trim_headers=True, output_file=qsv_input_csv) + except utils.JobError as e: + raise utils.JobError( + f"Job aborted as the file cannot be normalized/transcoded: {e}." + ) + + context.update_tmp(qsv_input_csv) + context.logger.info("Normalized & transcoded...") + + def _detect_encoding(self, context: ProcessingContext) -> str: + """ + Detect file encoding using uchardet. + + Args: + context: Processing context + + Returns: + Detected encoding string + + Raises: + utils.JobError: If encoding detection fails + """ + try: + file_encoding = subprocess.run( + ["uchardet", context.tmp], + check=True, + capture_output=True, + text=True, + ) + encoding = file_encoding.stdout.strip() + context.logger.info(f"Identified encoding of the file: {encoding}") + return encoding + except subprocess.CalledProcessError as e: + raise utils.JobError(f"Failed to detect file encoding: {e}") + + def _reencode_to_utf8( + self, context: ProcessingContext, from_encoding: str, output_file: str + ) -> None: + """ + Re-encode file to UTF-8 using iconv. + + Args: + context: Processing context + from_encoding: Source encoding + output_file: Output file path + + Raises: + utils.JobError: If re-encoding fails + """ + try: + cmd = subprocess.run( + ["iconv", "-f", from_encoding, "-t", "UTF-8", context.tmp], + capture_output=True, + check=True, + ) + with open(output_file, "wb") as f: + f.write(cmd.stdout) + context.logger.info("Successfully re-encoded to UTF-8") + except subprocess.CalledProcessError as e: + raise utils.JobError( + f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" + ) diff --git a/ckanext/datapusher_plus/jobs/stages/formula.py b/ckanext/datapusher_plus/jobs/stages/formula.py new file mode 100644 index 0000000..ad9bb53 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/formula.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- +""" +Formula stage for the DataPusher Plus pipeline. + +Handles DRUF (Data Resource Update Formulae) processing using Jinja2. +""" + +import time +from typing import Dict, Any, Optional + +import ckanext.datapusher_plus.datastore_utils as dsu +import ckanext.datapusher_plus.jinja2_helpers as j2h +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class FormulaStage(BaseStage): + """ + Processes DRUF formulae using Jinja2 templates. + + This stage is optional and requires the ckanext-scheming extension. + If scheming is not available, the stage will be skipped gracefully. 
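+
+    The stage only runs when a scheming plugin is enabled in the CKAN
+    configuration, e.g. (illustrative ini snippet):
+
+        ckan.plugins = ... datapusher_plus scheming_datasets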
+ + Responsibilities: + - Fetch scheming YAML and package metadata + - Process package formulae (direct updates) + - Process resource formulae (direct updates) + - Process package suggestion formulae + - Process resource suggestion formulae + + DRUF formulae come in two types: + 1. "formula": Direct field updates (package/resource) + 2. "suggestion_formula": Populates suggestion popovers for data entry + """ + + def __init__(self): + super().__init__(name="FormulaProcessing") + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Skip this stage if ckanext-scheming is not enabled in ckan.plugins. + + Args: + context: Processing context + + Returns: + True if scheming plugin is not enabled, False otherwise + """ + try: + # Check if scheming is in the ckan.plugins configuration + import ckan.plugins.toolkit as tk + + # Get the list of enabled plugins from config + plugins_config = tk.config.get('ckan.plugins', '') + enabled_plugins = [p.strip() for p in plugins_config.split()] + + # Check for scheming-related plugins + scheming_plugins = ['scheming_datasets', 'scheming_groups', + 'scheming_organizations', 'scheming'] + + if any(plugin in enabled_plugins for plugin in scheming_plugins): + return False # Scheming is enabled, don't skip + + # Scheming not enabled in config + context.logger.info( + "Skipping FormulaProcessing stage - ckanext-scheming not enabled in ckan.plugins" + ) + return True + + except Exception as e: + # If we can't read config, log and skip + context.logger.warning( + f"Unable to check ckan.plugins configuration: {e}. " + "Skipping FormulaProcessing stage." + ) + return True + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Process DRUF formulae. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + Returns early (None) if critical errors occur + """ + formulae_start = time.perf_counter() + + # Fetch scheming YAML and package + package_id = context.resource["package_id"] + try: + scheming_yaml, package = dsu.get_scheming_yaml( + package_id, scheming_yaml_type="dataset" + ) + except Exception as e: + context.logger.warning( + f"Unable to fetch scheming YAML (scheming may not be configured): {e}" + ) + context.logger.info("Skipping formula processing") + return context # Skip formula processing but continue pipeline + + # Validate scheming YAML + if not scheming_yaml or not isinstance(scheming_yaml, dict): + context.logger.info("No valid scheming YAML found, skipping formula processing") + return context + + # Check for suggestion formulae + has_suggestion_formula = self._check_for_suggestion_formulae(scheming_yaml) + + if has_suggestion_formula: + context.logger.info("Found suggestion formulae in schema") + + # Validate and setup dpp_suggestions field + if not self._setup_dpp_suggestions(context, scheming_yaml, package): + return None # Critical error, abort + else: + context.logger.info("No suggestion formulae found") + + context.logger.log(5, f"package: {package}") + + # Get resource field stats (need to retrieve from context or pass in) + resource_fields_stats = self._get_resource_field_stats(context) + resource_fields_freqs = self._get_resource_field_freqs(context) + + # Initialize formula processor + formula_processor = j2h.FormulaProcessor( + scheming_yaml, + package, + context.resource, + resource_fields_stats, + resource_fields_freqs, + context.dataset_stats, + context.logger, + ) + + # Update status + package.setdefault("dpp_suggestions", {})[ + "STATUS" + ] = "STARTING FORMULAE 
PROCESSING..." + dsu.patch_package(package) + + # Clear LRU caches + self._clear_caches() + + # Process package formulae (direct updates) + package = self._process_package_formulae( + context, formula_processor, package + ) + + # Process resource formulae (direct updates) + self._process_resource_formulae(context, formula_processor) + + # Process package suggestion formulae + package = self._process_package_suggestions( + context, formula_processor, package, package_id + ) + + # Process resource suggestion formulae + package = self._process_resource_suggestions( + context, formula_processor, package, package_id + ) + + # Formulae processing complete + formulae_elapsed = time.perf_counter() - formulae_start + context.logger.info( + f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." + ) + + return context + + def _check_for_suggestion_formulae(self, scheming_yaml: Dict[str, Any]) -> bool: + """ + Check if scheming YAML contains suggestion formulae. + + Args: + scheming_yaml: Scheming YAML dictionary + + Returns: + True if suggestion formulae exist + """ + return any( + isinstance(field, dict) + and any(key.startswith("suggestion_formula") for key in field.keys()) + for field in scheming_yaml["dataset_fields"] + ) + + def _setup_dpp_suggestions( + self, + context: ProcessingContext, + scheming_yaml: Dict[str, Any], + package: Dict[str, Any], + ) -> bool: + """ + Validate and setup dpp_suggestions field. + + Args: + context: Processing context + scheming_yaml: Scheming YAML dictionary + package: Package dictionary + + Returns: + True if setup successful, False if critical error + """ + # Check if schema has dpp_suggestions field + schema_has_dpp_suggestions = any( + isinstance(field, dict) and field.get("field_name") == "dpp_suggestions" + for field in scheming_yaml["dataset_fields"] + ) + + if not schema_has_dpp_suggestions: + context.logger.error( + '"dpp_suggestions" field required but not found in your schema. ' + "Ensure that your scheming.yaml file contains the " + '"dpp_suggestions" field as a json_object.' + ) + return False + else: + context.logger.info('Found "dpp_suggestions" field in schema') + + # Add dpp_suggestions to package if missing + if "dpp_suggestions" not in package: + context.logger.warning( + 'Warning: "dpp_suggestions" field required to process Suggestion ' + "Formulae is not found in this package. " + 'Adding "dpp_suggestions" to package' + ) + + try: + package["dpp_suggestions"] = {} + dsu.patch_package(package) + context.logger.warning('"dpp_suggestions" field added to package') + except Exception as e: + context.logger.error(f'Error adding "dpp_suggestions" field {e}') + return False + + return True + + def _get_resource_field_stats(self, context: ProcessingContext) -> Dict[str, Any]: + """ + Get resource field statistics from context. + + Args: + context: Processing context + + Returns: + Resource field statistics dictionary + """ + return context.resource_fields_stats + + def _get_resource_field_freqs(self, context: ProcessingContext) -> Dict[str, Any]: + """ + Get resource field frequencies from context. 
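+
+        The frequencies are expected to have been compiled by an earlier
+        analysis step (qsv frequency output) and carried on the context.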
+ + Args: + context: Processing context + + Returns: + Resource field frequencies dictionary + """ + return context.resource_fields_freqs + + def _clear_caches(self) -> None: + """Clear LRU caches before processing formulae.""" + dsu.datastore_search.cache_clear() + dsu.datastore_search_sql.cache_clear() + dsu.datastore_info.cache_clear() + dsu.index_exists.cache_clear() + + def _process_package_formulae( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + ) -> Dict[str, Any]: + """ + Process package formulae (direct updates). + + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + + Returns: + Updated package dictionary + """ + package_updates = formula_processor.process_formulae( + "package", "dataset_fields", "formula" + ) + + if package_updates: + package.update(package_updates) + status_msg = "PACKAGE formulae processed..." + package["dpp_suggestions"]["STATUS"] = status_msg + + try: + patched_package = dsu.patch_package(package) + context.logger.debug(f"Package after patching: {patched_package}") + package = patched_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error patching package: {str(e)}") + + return package + + def _process_resource_formulae( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + ) -> None: + """ + Process resource formulae (direct updates). + + Args: + context: Processing context + formula_processor: Formula processor instance + """ + resource_updates = formula_processor.process_formulae( + "resource", "resource_fields", "formula" + ) + + if resource_updates: + context.resource.update(resource_updates) + status_msg = "RESOURCE formulae processed..." + + if context.resource.get("dpp_suggestions"): + context.resource["dpp_suggestions"]["STATUS"] = status_msg + else: + context.resource["dpp_suggestions"] = {"STATUS": status_msg} + + context.logger.info(status_msg) + + def _process_package_suggestions( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + package_id: str, + ) -> Dict[str, Any]: + """ + Process package suggestion formulae. + + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + package_id: Package ID + + Returns: + Updated package dictionary + """ + package_suggestions = formula_processor.process_formulae( + "package", "dataset_fields", "suggestion_formula" + ) + + if package_suggestions: + context.logger.log(5, f"package_suggestions: {package_suggestions}") + revise_update_content = {"package": package_suggestions} + + try: + status_msg = "PACKAGE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + context.logger.log(5, f"Package after revising: {revised_package}") + package = revised_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error revising package: {str(e)}") + + return package + + def _process_resource_suggestions( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + package_id: str, + ) -> Dict[str, Any]: + """ + Process resource suggestion formulae. + + Note: Updates PACKAGE dpp_suggestions field, not resource. 
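+        Suggestions are stored under the package-level dpp_suggestions object,
+        keyed by the resource name.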
+ + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + package_id: Package ID + + Returns: + Updated package dictionary + """ + resource_suggestions = formula_processor.process_formulae( + "resource", "resource_fields", "suggestion_formula" + ) + + if resource_suggestions: + context.logger.log(5, f"resource_suggestions: {resource_suggestions}") + resource_name = context.resource["name"] + revise_update_content = { + "resource": {resource_name: resource_suggestions} + } + + # Handle existing suggestions + if package.get("dpp_suggestions"): + package["dpp_suggestions"].update(revise_update_content["resource"]) + else: + package["dpp_suggestions"] = revise_update_content["resource"] + + try: + status_msg = "RESOURCE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + context.logger.log(5, f"Package after revising: {revised_package}") + package = revised_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error revising package: {str(e)}") + + return package diff --git a/ckanext/datapusher_plus/jobs/stages/indexing.py b/ckanext/datapusher_plus/jobs/stages/indexing.py new file mode 100644 index 0000000..7884535 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/indexing.py @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +""" +Indexing stage for the DataPusher Plus pipeline. + +Handles automatic index creation based on cardinality and configuration. +""" + +import time +import psycopg2 +from psycopg2 import sql +from typing import List + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class IndexingStage(BaseStage): + """ + Creates database indexes automatically based on cardinality. + + Responsibilities: + - Create unique indexes for columns with all unique values + - Create regular indexes for low-cardinality columns + - Create indexes on date columns if configured + - Optimize table with VACUUM ANALYZE + """ + + def __init__(self): + super().__init__(name="Indexing") + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Skip indexing if not configured. + + Args: + context: Processing context + + Returns: + True if indexing should be skipped + """ + # Get datetime columns (need to check if analysis stage stored this) + datetimecols_list = self._get_datetime_columns(context) + + return not ( + conf.AUTO_INDEX_THRESHOLD + or (conf.AUTO_INDEX_DATES and datetimecols_list) + or conf.AUTO_UNIQUE_INDEX + ) + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Create database indexes. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If indexing fails + """ + index_start = time.perf_counter() + + # Get datetime columns + datetimecols_list = self._get_datetime_columns(context) + + context.logger.info( + f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} " + f"unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} " + f"Auto-index dates: {conf.AUTO_INDEX_DATES} ..." 
+ ) + + # Get cardinality data + headers_cardinality = context.dataset_stats.get("HEADERS_CARDINALITY", []) + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + + # Adjust threshold if set to -1 (index all columns) + auto_index_threshold = conf.AUTO_INDEX_THRESHOLD + if auto_index_threshold == -1: + auto_index_threshold = record_count + + # Create indexes + index_count = self._create_indexes( + context, + headers_cardinality, + datetimecols_list, + record_count, + auto_index_threshold, + ) + + index_elapsed = time.perf_counter() - index_start + context.logger.info( + f'...indexing/vacuum analysis done. Indexed {index_count} column/s ' + f'in "{context.resource_id}" in {index_elapsed:,.2f} seconds.' + ) + + return context + + def _get_datetime_columns(self, context: ProcessingContext) -> List[str]: + """ + Extract datetime column names from headers_dicts. + + Args: + context: Processing context + + Returns: + List of datetime column names + """ + datetimecols_list = [] + for header in context.headers_dicts: + if header.get("type") == "timestamp": + datetimecols_list.append(header["id"]) + return datetimecols_list + + def _create_indexes( + self, + context: ProcessingContext, + headers_cardinality: List[int], + datetimecols_list: List[str], + record_count: int, + auto_index_threshold: int, + ) -> int: + """ + Create indexes on appropriate columns. + + Args: + context: Processing context + headers_cardinality: List of cardinality values for each column + datetimecols_list: List of datetime column names + record_count: Total number of records + auto_index_threshold: Cardinality threshold for indexing + + Returns: + Number of indexes created + + Raises: + utils.JobError: If database connection fails + """ + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + index_cur = raw_connection.cursor() + index_count = 0 + + # Iterate through columns + for idx, cardinality in enumerate(headers_cardinality): + if idx >= len(context.headers): + break + + curr_col = context.headers[idx] + + # Check if we should create a unique index + if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: + if self._create_unique_index( + context, index_cur, curr_col, cardinality + ): + index_count += 1 + + # Check if we should create a regular index + elif cardinality <= auto_index_threshold or ( + conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) + ): + if self._create_regular_index( + context, index_cur, curr_col, cardinality, datetimecols_list + ): + index_count += 1 + + index_cur.close() + raw_connection.commit() + + # VACUUM ANALYZE to optimize indexes + self._vacuum_analyze(context, raw_connection) + + return index_count + + finally: + if raw_connection: + raw_connection.close() + + def _create_unique_index( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + column: str, + cardinality: int, + ) -> bool: + """ + Create a unique index on a column. + + Args: + context: Processing context + cursor: Database cursor + column: Column name + cardinality: Column cardinality + + Returns: + True if index was created successfully, False otherwise + """ + if conf.PREVIEW_ROWS > 0: + unique_value_count = min(conf.PREVIEW_ROWS, cardinality) + else: + unique_value_count = cardinality + + context.logger.info( + f'Creating UNIQUE index on "{column}" for {unique_value_count} unique values...' 
+ ) + + try: + cursor.execute( + sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( + sql.Identifier(context.resource_id), + sql.Identifier(column), + ) + ) + return True + except psycopg2.Error as e: + context.logger.warning(f'Could not CREATE UNIQUE INDEX on "{column}": {e}') + return False + + def _create_regular_index( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + column: str, + cardinality: int, + datetimecols_list: List[str], + ) -> bool: + """ + Create a regular index on a column. + + Args: + context: Processing context + cursor: Database cursor + column: Column name + cardinality: Column cardinality + datetimecols_list: List of datetime columns + + Returns: + True if index was created successfully, False otherwise + """ + if column in datetimecols_list: + context.logger.info( + f'Creating index on "{column}" date column for {cardinality} unique value/s...' + ) + else: + context.logger.info( + f'Creating index on "{column}" for {cardinality} unique value/s...' + ) + + try: + cursor.execute( + sql.SQL("CREATE INDEX ON {} ({})").format( + sql.Identifier(context.resource_id), + sql.Identifier(column), + ) + ) + return True + except psycopg2.Error as e: + context.logger.warning(f'Could not CREATE INDEX on "{column}": {e}') + return False + + def _vacuum_analyze( + self, context: ProcessingContext, connection: psycopg2.extensions.connection + ) -> None: + """ + Run VACUUM ANALYZE to optimize indexes. + + Args: + context: Processing context + connection: Database connection + """ + context.logger.info("Vacuum Analyzing table to optimize indices...") + + connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = connection.cursor() + try: + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(context.resource_id)) + ) + finally: + analyze_cur.close() diff --git a/ckanext/datapusher_plus/jobs/stages/metadata.py b/ckanext/datapusher_plus/jobs/stages/metadata.py new file mode 100644 index 0000000..e24ec5c --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/metadata.py @@ -0,0 +1,391 @@ +# -*- coding: utf-8 -*- +""" +Metadata stage for the DataPusher Plus pipeline. + +Handles resource metadata updates, auto-aliasing, and summary statistics. +""" + +import os +import time +import psycopg2 +from psycopg2 import sql +from typing import Optional + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class MetadataStage(BaseStage): + """ + Updates resource metadata and creates aliases. + + Responsibilities: + - Create auto-aliases for resources + - Create summary statistics resource + - Update resource metadata (datastore_active, record counts, etc.) + - Set final aliases and calculate record counts + """ + + def __init__(self): + super().__init__(name="MetadataUpdate") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Update resource metadata. 
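+
+        Any auto-alias created here is passed to dsu.send_resource_to_datastore
+        together with the final headers so the DataStore registers the alias
+        and recalculates the record count.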
+ + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If metadata update fails + """ + metadata_start = time.perf_counter() + context.logger.info("UPDATING RESOURCE METADATA...") + + # Connect to database for aliasing operations + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + cur = raw_connection.cursor() + + # Create auto-alias if configured + alias = self._create_auto_alias(context, cur) + + # Create summary statistics resource if configured + self._create_summary_stats_resource(context, cur) + + # Commit database changes + cur.close() + raw_connection.commit() + + finally: + if raw_connection: + raw_connection.close() + + # Update resource metadata + self._update_resource_metadata(context) + + # Set alias and calculate record count + dsu.send_resource_to_datastore( + resource=None, + resource_id=context.resource["id"], + headers=context.headers_dicts, + records=None, + aliases=alias, + calculate_record_count=True, + ) + + if alias: + context.logger.info(f'Created alias "{alias}" for "{context.resource_id}"...') + + metadata_elapsed = time.perf_counter() - metadata_start + context.logger.info( + f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in " + f"{metadata_elapsed:,.2f} seconds." + ) + + # Mark as done + package = dsu.get_package(context.resource["package_id"]) + package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" + dsu.patch_package(package) + + return context + + def _create_auto_alias( + self, context: ProcessingContext, cursor: psycopg2.extensions.cursor + ) -> Optional[str]: + """ + Create auto-alias for the resource. + + Args: + context: Processing context + cursor: Database cursor + + Returns: + Alias name if created, None otherwise + """ + if not conf.AUTO_ALIAS: + return None + + context.logger.info( + f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ..." 
+ ) + + # Get package info for alias construction + package = dsu.get_package(context.resource["package_id"]) + + resource_name = context.resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = owner_org.get("name") if owner_org else "" + + if not (resource_name and package_name and owner_org_name): + context.logger.warning( + f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" + ) + return None + + # Create base alias (limited to 55 chars for sequence/stats suffix) + alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + + # Check if alias exists + cursor.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", + (alias + "%",), + ) + alias_query_result = cursor.fetchone() + + if alias_query_result: + alias_count = alias_query_result[0] + existing_alias_of = alias_query_result[1] + else: + alias_count = 0 + existing_alias_of = "" + + # Handle alias uniqueness + if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: + alias_sequence = alias_count + 1 + while True: + # Find next available sequence number + alias = f"{alias}-{alias_sequence:03}" + cursor.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", + (alias + "%",), + ) + result = cursor.fetchone() + alias_exists = result[0] if result else 0 + if not alias_exists: + break + alias_sequence += 1 + elif alias_count == 1: + # Drop existing alias + context.logger.warning( + f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' + ) + try: + cursor.execute( + sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) + ) + except psycopg2.Error as e: + context.logger.warning(f"Could not drop alias/view: {e}") + + return alias + + def _create_summary_stats_resource( + self, context: ProcessingContext, cursor: psycopg2.extensions.cursor + ) -> None: + """ + Create summary statistics resource. 
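+
+        The stats resource is aliased as "<resource_id>-stats" and, when
+        auto-aliasing is enabled, also under the main resource's auto-alias
+        suffixed with "-stats".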
+ + Args: + context: Processing context + cursor: Database cursor + + Raises: + utils.JobError: If stats resource creation fails + """ + # Check if we should create summary stats + if not (conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW): + return + + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + if not (conf.PREVIEW_ROWS == 0 or conf.SUMMARY_STATS_WITH_PREVIEW): + # Skip if preview mode and not explicitly enabled + return + + stats_resource_id = context.resource_id + "-stats" + + # Delete existing stats resource + self._delete_existing_stats(context, cursor, stats_resource_id) + + # Prepare aliases for stats resource + stats_aliases = [stats_resource_id] + if conf.AUTO_ALIAS: + # Get base alias from main resource + package = dsu.get_package(context.resource["package_id"]) + resource_name = context.resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = owner_org.get("name") if owner_org else "" + base_alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + + auto_alias_stats_id = base_alias + "-stats" + stats_aliases.append(auto_alias_stats_id) + + # Delete existing auto-aliased stats + self._delete_existing_stats(context, cursor, auto_alias_stats_id) + + # Infer stats schema + qsv_stats_csv = os.path.join(context.temp_dir, "qsv_stats.csv") + stats_stats_dict = self._infer_stats_schema(context, qsv_stats_csv) + + # Create stats resource + resource_name = context.resource.get("name") + stats_resource = { + "package_id": context.resource["package_id"], + "name": resource_name + " - Summary Statistics", + "format": "CSV", + "mimetype": "text/csv", + } + + stats_response = dsu.send_resource_to_datastore( + stats_resource, + resource_id=None, + headers=stats_stats_dict, + records=None, + aliases=stats_aliases, + calculate_record_count=False, + ) + + context.logger.info(f"stats_response: {stats_response}") + + new_stats_resource_id = stats_response["result"]["resource_id"] + + # Copy stats data to datastore + self._copy_stats_to_datastore( + context, cursor, qsv_stats_csv, new_stats_resource_id, stats_stats_dict + ) + + # Update stats resource metadata + stats_resource["id"] = new_stats_resource_id + stats_resource["summary_statistics"] = True + stats_resource["summary_of_resource"] = context.resource_id + dsu.update_resource(stats_resource) + + def _delete_existing_stats( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + stats_id: str, + ) -> None: + """ + Delete existing stats resource if it exists. + + Args: + context: Processing context + cursor: Database cursor + stats_id: Stats resource ID or alias + """ + existing_stats = dsu.datastore_resource_exists(stats_id) + if existing_stats: + context.logger.info(f'Deleting existing summary stats "{stats_id}".') + + cursor.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (stats_id + "%",), + ) + stats_alias_result = cursor.fetchone() + + if stats_alias_result: + existing_stats_alias_of = stats_alias_result[0] + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + def _infer_stats_schema( + self, context: ProcessingContext, qsv_stats_csv: str + ) -> list: + """ + Infer schema for stats CSV. 
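+
+        The typesonly output of qsv stats is parsed line by line: the header
+        row is skipped, then the field name is taken from the first
+        comma-separated column and the inferred type from the second, mapped
+        through conf.TYPE_MAPPING.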
+ + Args: + context: Processing context + qsv_stats_csv: Path to stats CSV + + Returns: + List of stats field dictionaries + + Raises: + utils.JobError: If schema inference fails + """ + try: + qsv_stats_stats = context.qsv.stats( + qsv_stats_csv, + typesonly=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot run stats on CSV stats: {e}") + + stats_stats = str(qsv_stats_stats.stdout).strip() + stats_stats_dict = [ + dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) + for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) + ] + + context.logger.info(f"stats_stats_dict: {stats_stats_dict}") + + return stats_stats_dict + + def _copy_stats_to_datastore( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + qsv_stats_csv: str, + stats_resource_id: str, + stats_stats_dict: list, + ) -> None: + """ + Copy stats data to datastore. + + Args: + context: Processing context + cursor: Database cursor + qsv_stats_csv: Path to stats CSV + stats_resource_id: Stats resource ID + stats_stats_dict: Stats schema + + Raises: + utils.JobError: If COPY fails + """ + col_names_list = [h["id"] for h in stats_stats_dict] + stats_aliases_str = f"{stats_resource_id}, ..." + + context.logger.info( + f'ADDING SUMMARY STATISTICS {col_names_list} in "{stats_resource_id}" ' + f'with alias/es "{stats_aliases_str}"...' + ) + + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN WITH (FORMAT CSV, HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(stats_resource_id), + column_names, + ) + + with open(qsv_stats_csv, "rb") as f: + try: + cursor.copy_expert(copy_sql, f) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + + def _update_resource_metadata(self, context: ProcessingContext) -> None: + """ + Update resource metadata fields. + + Args: + context: Processing context + """ + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + + context.resource["datastore_active"] = True + context.resource["total_record_count"] = record_count + + if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): + context.resource["preview"] = True + context.resource["preview_rows"] = context.copied_count + else: + context.resource["preview"] = False + context.resource["preview_rows"] = None + context.resource["partial_download"] = False + + dsu.update_resource(context.resource) diff --git a/ckanext/datapusher_plus/jobs/stages/validation.py b/ckanext/datapusher_plus/jobs/stages/validation.py new file mode 100644 index 0000000..7f64018 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/validation.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +""" +Validation stage for the DataPusher Plus pipeline. + +Handles CSV validation and deduplication. +""" + +import os +import json +import subprocess +from typing import Dict, Any, Union + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class ValidationStage(BaseStage): + """ + Validates CSV file and performs deduplication. 
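+
+    Both checks operate on the working copy in context.tmp; when deduplication
+    rewrites the file, context.update_tmp() points later stages at the deduped
+    CSV.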
+ + Responsibilities: + - Validate CSV against RFC4180 standard + - Check if CSV is sorted + - Count duplicates + - Deduplicate if needed + """ + + def __init__(self): + super().__init__(name="Validation") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Validate CSV and deduplicate if needed. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If validation fails + """ + # Validate CSV + self._validate_csv(context) + + # Check for duplicates and sort order + dupe_count = 0 + if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: + dupe_count = self._check_duplicates(context) + + # Deduplicate if needed + if conf.DEDUP and dupe_count > 0: + self._deduplicate(context, dupe_count) + else: + context.add_stat("DEDUPED", False) + + return context + + def _validate_csv(self, context: ProcessingContext) -> None: + """ + Validate CSV against RFC4180 standard. + + Args: + context: Processing context + + Raises: + utils.JobError: If CSV is invalid + """ + context.logger.info("Validating CSV...") + try: + context.qsv.validate(context.tmp) + except utils.JobError as e: + raise utils.JobError(f"qsv validate failed: {e}") + + context.logger.info("Well-formed, valid CSV file confirmed...") + + def _check_duplicates(self, context: ProcessingContext) -> int: + """ + Check for duplicates and if CSV is sorted. + + Args: + context: Processing context + + Returns: + Number of duplicates found + + Raises: + utils.JobError: If sortcheck fails + """ + context.logger.info("Checking for duplicates and if the CSV is sorted...") + + try: + qsv_sortcheck = context.qsv.sortcheck( + context.tmp, json_output=True, uses_stdio=True + ) + except utils.JobError as e: + raise utils.JobError( + f"Failed to check if CSV is sorted and has duplicates: {e}" + ) + + # Parse sortcheck output + sortcheck_json = self._parse_sortcheck_output(qsv_sortcheck) + + # Extract and store statistics + is_sorted = bool(sortcheck_json.get("sorted", False)) + record_count = int(sortcheck_json.get("record_count", 0)) + unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) + dupe_count = int(sortcheck_json.get("dupe_count", 0)) + + context.add_stat("IS_SORTED", is_sorted) + context.add_stat("RECORD_COUNT", record_count) + context.add_stat("UNSORTED_BREAKS", unsorted_breaks) + context.add_stat("DUPE_COUNT", dupe_count) + + # Format log message + sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" + if is_sorted and dupe_count > 0: + sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" + + context.logger.info(sortcheck_msg) + + return dupe_count + + def _parse_sortcheck_output( + self, qsv_sortcheck: Union[subprocess.CompletedProcess, Dict[str, Any]] + ) -> Dict[str, Any]: + """ + Parse sortcheck JSON output. 
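+
+        Expected shape of the sortcheck JSON (illustrative; only the keys read
+        below are shown):
+
+            {"sorted": false, "record_count": 1000,
+             "unsorted_breaks": 12, "dupe_count": 0}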
+ + Args: + qsv_sortcheck: Output from qsv sortcheck command + + Returns: + Parsed JSON dictionary + + Raises: + utils.JobError: If parsing fails + """ + try: + # Handle both subprocess.CompletedProcess and dict outputs + stdout_content = ( + qsv_sortcheck.stdout + if hasattr(qsv_sortcheck, "stdout") + else qsv_sortcheck.get("stdout") + ) + sortcheck_json = json.loads(str(stdout_content)) + except (json.JSONDecodeError, AttributeError) as e: + raise utils.JobError(f"Failed to parse sortcheck JSON output: {e}") + + # Validate required fields + try: + # Ensure numeric values are valid + int(sortcheck_json.get("record_count", 0)) + int(sortcheck_json.get("unsorted_breaks", 0)) + int(sortcheck_json.get("dupe_count", 0)) + except (ValueError, TypeError) as e: + raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") + + return sortcheck_json + + def _deduplicate(self, context: ProcessingContext, dupe_count: int) -> None: + """ + Deduplicate the CSV file. + + Args: + context: Processing context + dupe_count: Number of duplicates found + + Raises: + utils.JobError: If deduplication fails + """ + qsv_dedup_csv = os.path.join(context.temp_dir, "qsv_dedup.csv") + context.logger.info(f"{dupe_count} duplicate rows found. Deduping...") + + try: + context.qsv.extdedup(context.tmp, qsv_dedup_csv) + except utils.JobError as e: + raise utils.JobError(f"Check for duplicates error: {e}") + + context.add_stat("DEDUPED", True) + context.update_tmp(qsv_dedup_csv) + context.logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") diff --git a/ckanext/datapusher_plus/jobs/utils/__init__.py b/ckanext/datapusher_plus/jobs/utils/__init__.py new file mode 100644 index 0000000..4248b9e --- /dev/null +++ b/ckanext/datapusher_plus/jobs/utils/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +""" +Utility modules for the DataPusher Plus job processing pipeline. 
+""" + +__all__ = [] diff --git a/ckanext/datapusher_plus/jobs_legacy.py b/ckanext/datapusher_plus/jobs_legacy.py new file mode 100644 index 0000000..23f57bd --- /dev/null +++ b/ckanext/datapusher_plus/jobs_legacy.py @@ -0,0 +1,1623 @@ +# -*- coding: utf-8 -*- +# flake8: noqa: E501 + +# Standard library imports +import csv +import hashlib +import locale +import mimetypes +import os +import subprocess +import tempfile +import time +from urllib.parse import urlsplit, urlparse +import logging +import uuid +import sys +import json +import requests +from pathlib import Path +from typing import Dict, Any, Optional, List + +# Third-party imports +import psycopg2 +from psycopg2 import sql +from datasize import DataSize +from dateutil.parser import parse as parsedate +import traceback +import sqlalchemy as sa +from rq import get_current_job + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.jinja2_helpers as j2h +from ckanext.datapusher_plus.job_exceptions import HTTPError +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.spatial_helpers as sh +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.logging_utils import TRACE +from ckanext.datapusher_plus.qsv_utils import QSVCommand +from ckanext.datapusher_plus.pii_screening import screen_for_pii + +if locale.getdefaultlocale()[0]: + lang, encoding = locale.getdefaultlocale() + locale.setlocale(locale.LC_ALL, locale=(lang, encoding)) +else: + locale.setlocale(locale.LC_ALL, "") + + +def validate_input(input: Dict[str, Any]) -> None: + # Especially validate metadata which is provided by the user + if "metadata" not in input: + raise utils.JobError("Metadata missing") + + data = input["metadata"] + + if "resource_id" not in data: + raise utils.JobError("No id provided.") + + +def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: + api_token = utils.get_dp_plus_user_apitoken() + headers: Dict[str, str] = { + "Content-Type": "application/json", + "Authorization": api_token, + } + + try: + result = requests.post( + result_url, + data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), + verify=conf.SSL_VERIFY, + headers=headers, + ) + except requests.ConnectionError: + return False + + return result.status_code == requests.codes.ok + + +def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: + """ + This is the main function that is called by the datapusher_plus worker + + Errors are caught and logged in the database + + Args: + input: Dictionary containing metadata and other job information + + Returns: + Optional[str]: Returns "error" if there was an error, None otherwise + """ + job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") + callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) + + job_id = get_current_job().id + errored = False + try: + push_to_datastore(input, job_id) + job_dict["status"] = "complete" + dph.mark_job_as_completed(job_id, job_dict) + except utils.JobError as e: + dph.mark_job_as_errored(job_id, str(e)) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + except Exception as e: + dph.mark_job_as_errored( + job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) + ) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + 
log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + finally: + # job_dict is defined in datapusher_hook's docstring + is_saved_ok = callback_datapusher_hook( + result_url=input["result_url"], job_dict=job_dict + ) + errored = errored or not is_saved_ok + return "error" if errored else None + + +def push_to_datastore( + input: Dict[str, Any], task_id: str, dry_run: bool = False +) -> Optional[List[Dict[str, Any]]]: + """Download and parse a resource push its data into CKAN's DataStore. + + An asynchronous job that gets a resource from CKAN, downloads the + resource's data file and, if the data file has changed since last time, + parses the data and posts it into CKAN's DataStore. + + Args: + input: Dictionary containing metadata and other job information + task_id: Unique identifier for the task + dry_run: If True, fetch and parse the data file but don't actually post the + data to the DataStore, instead return the data headers and rows that + would have been posted. + + Returns: + Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows + that would have been posted. Otherwise returns None. + """ + # Ensure temporary files are removed after run + with tempfile.TemporaryDirectory() as temp_dir: + return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) + + +def _push_to_datastore( + task_id: str, + input: Dict[str, Any], + dry_run: bool = False, + temp_dir: Optional[str] = None, +) -> Optional[List[Dict[str, Any]]]: + # add job to dn (datapusher_plus_jobs table) + try: + dph.add_pending_job(task_id, **input) + except sa.exc.IntegrityError: + raise utils.JobError("Job already exists.") + handler = utils.StoringHandler(task_id, input) + logger = logging.getLogger(task_id) + logger.addHandler(handler) + + # also show logs on stderr + logger.addHandler(logging.StreamHandler()) + + # set the log level to the config upload_log_level + try: + log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) + except AttributeError: + # fallback to our custom TRACE level + log_level = TRACE + + # set the log level to the config upload_log_level + logger.setLevel(logging.INFO) + logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") + logger.setLevel(log_level) + + # check if conf.QSV_BIN exists + if not Path(conf.QSV_BIN).is_file(): + raise utils.JobError(f"{conf.QSV_BIN} not found.") + + # Initialize QSVCommand + qsv = QSVCommand(logger=logger) + + validate_input(input) + + data = input["metadata"] + + ckan_url = data["ckan_url"] + resource_id = data["resource_id"] + try: + resource = dsu.get_resource(resource_id) + except utils.JobError: + # try again in 5 seconds just incase CKAN is slow at adding resource + time.sleep(5) + resource = dsu.get_resource(resource_id) + + # check if the resource url_type is a datastore + if resource.get("url_type") == "datastore": + logger.info("Dump files are managed with the Datastore API") + return + + # check scheme + resource_url = resource.get("url") + scheme = urlsplit(resource_url).scheme + if scheme not in ("http", "https", "ftp"): + raise utils.JobError("Only http, https, and ftp resources may be fetched.") + + # ========================================================================== + # DOWNLOAD + # ========================================================================== + timer_start = time.perf_counter() + dataset_stats = {} + + # fetch the resource data + logger.info(f"Fetching from: {resource_url}...") + headers: Dict[str, str] = {} + if 
resource.get("url_type") == "upload": + # If this is an uploaded file to CKAN, authenticate the request, + # otherwise we won't get file from private resources + api_token = utils.get_dp_plus_user_apitoken() + headers["Authorization"] = api_token + + # If the ckan_url differs from this url, rewrite this url to the ckan + # url. This can be useful if ckan is behind a firewall. + if not resource_url.startswith(ckan_url): + new_url = urlparse(resource_url) + rewrite_url = urlparse(ckan_url) + new_url = new_url._replace( + scheme=rewrite_url.scheme, netloc=rewrite_url.netloc + ) + resource_url = new_url.geturl() + logger.info(f"Rewritten resource url to: {resource_url}") + + try: + kwargs: Dict[str, Any] = { + "headers": headers, + "timeout": conf.TIMEOUT, + "verify": conf.SSL_VERIFY, + "stream": True, + } + if conf.USE_PROXY: + kwargs["proxies"] = { + "http": conf.DOWNLOAD_PROXY, + "https": conf.DOWNLOAD_PROXY, + } + with requests.get(resource_url, **kwargs) as response: + response.raise_for_status() + + cl = response.headers.get("content-length") + max_content_length = conf.MAX_CONTENT_LENGTH + ct = response.headers.get("content-type") + + try: + if cl and int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: + raise utils.JobError( + f"Resource too large to download: {DataSize(int(cl)):.2MB} > max ({DataSize(int(max_content_length)):.2MB})." + ) + except ValueError: + pass + + resource_format = resource.get("format").upper() + + # if format was not specified, try to get it from mime type + if not resource_format: + logger.info("File format: NOT SPECIFIED") + # if we have a mime type, get the file extension from the response header + if ct: + resource_format = mimetypes.guess_extension(ct.split(";")[0]) + + if resource_format is None: + raise utils.JobError( + "Cannot determine format from mime type. Please specify format." + ) + logger.info(f"Inferred file format: {resource_format}") + else: + raise utils.JobError( + "Server did not return content-type. Please specify format." + ) + else: + logger.info(f"File format: {resource_format}") + + tmp = os.path.join(temp_dir, "tmp." + resource_format) + length = 0 + # using MD5 for file deduplication only + # no need for it to be cryptographically secure + m = hashlib.md5() # DevSkim: ignore DS126858 + + # download the file + if cl: + logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") + else: + logger.info("Downloading file of unknown size...") + + with open(tmp, "wb") as tmp_file: + for chunk in response.iter_content(conf.CHUNK_SIZE): + length += len(chunk) + if length > max_content_length and not conf.PREVIEW_ROWS: + raise utils.JobError( + f"Resource too large to process: {length} > max ({max_content_length})." + ) + tmp_file.write(chunk) + m.update(chunk) + + except requests.HTTPError as e: + raise HTTPError( + f"DataPusher+ received a bad HTTP response when trying to download " + f"the data file from {resource_url}. 
Status code: {e.response.status_code}, " + f"Response content: {e.response.content}", + status_code=e.response.status_code, + request_url=resource_url, + response=e.response.content, + ) + except requests.RequestException as e: + raise HTTPError( + message=str(e), + status_code=None, + request_url=resource_url, + response=None, + ) + + file_hash = m.hexdigest() + dataset_stats["ORIGINAL_FILE_SIZE"] = length + + # check if the resource metadata (like data dictionary data types) + # has been updated since the last fetch + resource_updated = False + resource_last_modified = resource.get("last_modified") + if resource_last_modified: + resource_last_modified = parsedate(resource_last_modified) + file_last_modified = response.headers.get("last-modified") + if file_last_modified: + file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) + if file_last_modified < resource_last_modified: + resource_updated = True + + if ( + resource.get("hash") == file_hash + and not data.get("ignore_hash") + and not conf.IGNORE_FILE_HASH + and not resource_updated + ): + logger.warning(f"Upload skipped as the file hash hasn't changed: {file_hash}.") + return + + resource["hash"] = file_hash + + fetch_elapsed = time.perf_counter() - timer_start + logger.info( + f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." + ) + + # Check if the file is a zip file + unzipped_format = "" + if resource_format.upper() == "ZIP": + logger.info("Processing ZIP file...") + + file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( + tmp, temp_dir, logger + ) + if not file_count: + logger.error("ZIP file invalid or no files found in ZIP file.") + return + logger.info( + f"More than one file in the ZIP file ({file_count} files), saving metadata..." + if file_count > 1 + else f"Extracted {unzipped_format} file: {extracted_path}" + ) + tmp = extracted_path + + # =================================================================================== + # ANALYZE WITH QSV + # =================================================================================== + # Start Analysis using qsv instead of messytables, as + # 1) its type inferences are bullet-proof not guesses as it scans the entire file, + # 2) its super-fast, and + # 3) it has addl data-wrangling capabilities we use in DP+ (e.g. stats, dedup, etc.) + dupe_count = 0 + record_count = 0 + analysis_start = time.perf_counter() + logger.info("ANALYZING WITH QSV..") + + # flag to check if the file is a spatial format + spatial_format_flag = False + simplification_failed_flag = False + # ----------------- is it a spreadsheet? --------------- + # check content type or file extension if its a spreadsheet + spreadsheet_extensions = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] + file_format = resource.get("format").upper() + if ( + file_format in spreadsheet_extensions + or unzipped_format in spreadsheet_extensions + ): + # if so, export spreadsheet as a CSV file + default_excel_sheet = conf.DEFAULT_EXCEL_SHEET + file_format = unzipped_format if unzipped_format != "" else file_format + logger.info(f"Converting {file_format} sheet {default_excel_sheet} to CSV...") + # first, we need a temporary spreadsheet filename with the right file extension + # we only need the filename though, that's why we remove it + # and create a hardlink to the file we got from CKAN + qsv_spreadsheet = os.path.join(temp_dir, "qsv_spreadsheet." 
+ file_format) + os.link(tmp, qsv_spreadsheet) + + # run `qsv excel` and export it to a CSV + # use --trim option to trim column names and the data + qsv_excel_csv = os.path.join(temp_dir, "qsv_excel.csv") + try: + qsv_excel = qsv.excel( + qsv_spreadsheet, + sheet=default_excel_sheet, + trim=True, + output_file=qsv_excel_csv, + ) + except utils.JobError as e: + raise utils.JobError( + f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" + ) + excel_export_msg = qsv_excel.stderr + logger.info(f"{excel_export_msg}...") + tmp = qsv_excel_csv + elif resource_format.upper() in ["SHP", "QGIS", "GEOJSON"]: + logger.info("SHAPEFILE or GEOJSON file detected...") + + qsv_spatial_file = os.path.join( + temp_dir, + "qsv_spatial_" + str(uuid.uuid4()) + "." + resource_format, + ) + os.link(tmp, qsv_spatial_file) + qsv_spatial_csv = os.path.join(temp_dir, "qsv_spatial.csv") + + if conf.AUTO_SPATIAL_SIMPLIFICATION: + # Try to convert spatial file to CSV using spatial_helpers + logger.info( + f"Converting spatial file to CSV with a simplification relative tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." + ) + + try: + # Use the convert_to_csv function from spatial_helpers + success, error_message, bounds = sh.process_spatial_file( + qsv_spatial_file, + resource_format, + output_csv_path=qsv_spatial_csv, + tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, + task_logger=logger, + ) + + if success: + logger.info( + "Spatial file successfully simplified and converted to CSV" + ) + tmp = qsv_spatial_csv + + # Check if the simplified resource already exists + simplified_resource_name = ( + os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1] + ) + existing_resource, existing_resource_id = dsu.resource_exists( + resource["package_id"], simplified_resource_name + ) + + if existing_resource: + logger.info( + "Simplified resource already exists. Replacing it..." + ) + dsu.delete_resource(existing_resource_id) + else: + logger.info( + "Simplified resource does not exist. Uploading it..." + ) + new_simplified_resource = { + "package_id": resource["package_id"], + "name": os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1], + "url": "", + "format": resource["format"], + "hash": "", + "mimetype": resource["mimetype"], + "mimetype_inner": resource["mimetype_inner"], + } + + # Add bounds information if available + if bounds: + minx, miny, maxx, maxy = bounds + new_simplified_resource.update( + { + "dpp_spatial_extent": { + "type": "BoundingBox", + "coordinates": [ + [minx, miny], + [maxx, maxy], + ], + } + } + ) + logger.info( + f"Added dpp_spatial_extent to resource metadata: {bounds}" + ) + + dsu.upload_resource(new_simplified_resource, qsv_spatial_file) + + # delete the simplified spatial file + os.remove(qsv_spatial_file) + + simplification_failed_flag = False + else: + logger.warning( + f"Upload of simplified spatial file failed: {error_message}" + ) + simplification_failed_flag = True + except Exception as e: + logger.warning(f"Simplification and conversion failed: {str(e)}") + logger.warning( + f"Simplification and conversion failed. Using qsv geoconvert to convert to CSV, truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." 
+ ) + simplification_failed_flag = True + + # If we are not auto-simplifying or simplification failed, use qsv geoconvert + if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed_flag: + logger.info("Converting spatial file to CSV using qsv geoconvert...") + + # Run qsv geoconvert + qsv_geoconvert_csv = os.path.join(temp_dir, "qsv_geoconvert.csv") + try: + qsv.geoconvert( + tmp, + resource_format, + "csv", + max_length=conf.QSV_STATS_STRING_MAX_LENGTH, + output_file=qsv_geoconvert_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"qsv geoconvert failed: {e}") + + tmp = qsv_geoconvert_csv + logger.info("Geoconverted successfully") + + else: + # --- its not a spreadsheet nor a spatial format, its a CSV/TSV/TAB file ------ + # Normalize & transcode to UTF-8 using `qsv input`. We need to normalize as + # it could be a CSV/TSV/TAB dialect with differing delimiters, quoting, etc. + # Using qsv input's --output option also auto-transcodes to UTF-8. + # Note that we only change the workfile, the resource file itself is unchanged. + + # ------------------- Normalize to CSV --------------------- + qsv_input_csv = os.path.join(temp_dir, "qsv_input.csv") + # if resource_format is CSV we don't need to normalize + if resource_format.upper() == "CSV": + logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") + else: + # if not CSV (e.g. TSV, TAB, etc.) we need to normalize to CSV + logger.info(f"Normalizing/UTF-8 transcoding {resource_format} to CSV...") + + qsv_input_utf_8_encoded_csv = os.path.join( + temp_dir, "qsv_input_utf_8_encoded.csv" + ) + + # using uchardet to determine encoding + file_encoding = subprocess.run( + ["uchardet", tmp], + check=True, + capture_output=True, + text=True, + ) + logger.info(f"Identified encoding of the file: {file_encoding.stdout}") + + # trim the encoding string + file_encoding.stdout = file_encoding.stdout.strip() + + # using iconv to re-encode in UTF-8 OR ASCII (as ASCII is a subset of UTF-8) + if file_encoding.stdout != "UTF-8" and file_encoding.stdout != "ASCII": + logger.info( + f"File is not UTF-8 encoded. Re-encoding from {file_encoding.stdout} to UTF-8" + ) + try: + cmd = subprocess.run( + [ + "iconv", + "-f", + file_encoding.stdout, + "-t", + "UTF-8", + tmp, + ], + capture_output=True, + check=True, + ) + except subprocess.CalledProcessError as e: + raise utils.JobError( + f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" + ) + f = open(qsv_input_utf_8_encoded_csv, "wb") + f.write(cmd.stdout) + f.close() + logger.info("Successfully re-encoded to UTF-8") + + else: + qsv_input_utf_8_encoded_csv = tmp + try: + qsv.input(tmp, trim_headers=True, output_file=qsv_input_csv) + except utils.JobError as e: + raise utils.JobError( + f"Job aborted as the file cannot be normalized/transcoded: {e}." + ) + tmp = qsv_input_csv + logger.info("Normalized & transcoded...") + + # ------------------------------------- Validate CSV -------------------------------------- + # Run an RFC4180 check with `qsv validate` against the normalized, UTF-8 encoded CSV file. + # Even excel exported CSVs can be potentially invalid, as it allows the export of "flexible" + # CSVs - i.e. rows may have different column counts. + # If it passes validation, we can handle it with confidence downstream as a "normal" CSV. 
+ logger.info("Validating CSV...") + try: + qsv.validate(tmp) + except utils.JobError as e: + raise utils.JobError(f"qsv validate failed: {e}") + + logger.info("Well-formed, valid CSV file confirmed...") + + # --------------------- Sortcheck -------------------------- + # if SORT_AND_DUPE_CHECK is True or DEDUP is True + # check if the file is sorted and if it has duplicates + # get the record count, unsorted breaks and duplicate count as well + if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: + logger.info("Checking for duplicates and if the CSV is sorted...") + + try: + qsv_sortcheck = qsv.sortcheck(tmp, json_output=True, uses_stdio=True) + except utils.JobError as e: + raise utils.JobError( + f"Failed to check if CSV is sorted and has duplicates: {e}" + ) + + try: + # Handle both subprocess.CompletedProcess and dict outputs + stdout_content = ( + qsv_sortcheck.stdout + if hasattr(qsv_sortcheck, "stdout") + else qsv_sortcheck.get("stdout") + ) + sortcheck_json = json.loads(str(stdout_content)) + except (json.JSONDecodeError, AttributeError) as e: + raise utils.JobError(f"Failed to parse sortcheck JSONoutput: {e}") + + try: + # Extract and validate required fields + is_sorted = bool(sortcheck_json.get("sorted", False)) + record_count = int(sortcheck_json.get("record_count", 0)) + unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) + dupe_count = int(sortcheck_json.get("dupe_count", 0)) + dataset_stats["IS_SORTED"] = is_sorted + dataset_stats["RECORD_COUNT"] = record_count + dataset_stats["UNSORTED_BREAKS"] = unsorted_breaks + dataset_stats["DUPE_COUNT"] = dupe_count + except (ValueError, TypeError) as e: + raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") + + # Format the message with clear statistics + sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" + if is_sorted and dupe_count > 0: + sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" + + logger.info(sortcheck_msg) + + # --------------- Do we need to dedup? ------------------ + if conf.DEDUP and dupe_count > 0: + qsv_dedup_csv = os.path.join(temp_dir, "qsv_dedup.csv") + logger.info(f"{dupe_count} duplicate rows found. Deduping...") + + try: + qsv.extdedup(tmp, qsv_dedup_csv) + except utils.JobError as e: + raise utils.JobError(f"Check for duplicates error: {e}") + + dataset_stats["DEDUPED"] = True + tmp = qsv_dedup_csv + logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") + else: + dataset_stats["DEDUPED"] = False + + # ----------------------- Headers & Safenames --------------------------- + # get existing header names, so we can use them for data dictionary labels + # should we need to change the column name to make it "db-safe" + try: + qsv_headers = qsv.headers(tmp, just_names=True) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + original_headers = str(qsv_headers.stdout).strip() + original_header_dict = { + idx: ele for idx, ele in enumerate(original_headers.splitlines()) + } + + # now, ensure our column/header names identifiers are "safe names" + # i.e. 
valid postgres/CKAN Datastore identifiers + qsv_safenames_csv = os.path.join(temp_dir, "qsv_safenames.csv") + logger.info('Checking for "database-safe" header names...') + try: + qsv_safenames = qsv.safenames( + tmp, + mode="json", + reserved=conf.RESERVED_COLNAMES, + prefix=conf.UNSAFE_PREFIX, + uses_stdio=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + unsafe_json = json.loads(str(qsv_safenames.stdout)) + unsafe_headers = unsafe_json["unsafe_headers"] + + if unsafe_headers: + logger.info( + f'"{len(unsafe_headers)} unsafe" header names found ({unsafe_headers}). Sanitizing..."' + ) + qsv_safenames = qsv.safenames( + tmp, mode="conditional", output_file=qsv_safenames_csv + ) + tmp = qsv_safenames_csv + else: + logger.info("No unsafe header names found...") + + # ---------------------- Type Inferencing ----------------------- + # at this stage, we have a "clean" CSV ready for Type Inferencing + + # first, index csv for speed - count, stats and slice + # are all accelerated/multithreaded when an index is present + try: + qsv_index_file = tmp + ".idx" + qsv.index(tmp) + except utils.JobError as e: + raise utils.JobError(f"Cannot index CSV: {e}") + + # if SORT_AND_DUPE_CHECK = True, we already know the record count + # so we can skip qsv count. + if not conf.SORT_AND_DUPE_CHECK: + # get record count, this is instantaneous with an index + try: + qsv_count = qsv.count(tmp) + record_count = int(str(qsv_count.stdout).strip()) + dataset_stats["RECORD_COUNT"] = record_count + except utils.JobError as e: + raise utils.JobError(f"Cannot count records in CSV: {e}") + + # its empty, nothing to do + if record_count == 0: + logger.warning("Upload skipped as there are zero records.") + return + + # log how many records we detected + unique_qualifier = "" + if conf.DEDUP: + unique_qualifier = "unique" + logger.info(f"{record_count} {unique_qualifier} records detected...") + + # run qsv stats to get data types and summary statistics + logger.info("Inferring data types and compiling statistics...") + headers = [] + types = [] + headers_min = [] + headers_max = [] + headers_cardinality = [] + qsv_stats_csv = os.path.join(temp_dir, "qsv_stats.csv") + + try: + # If the file is a spatial format, we need to use --max-length + # to truncate overly long strings from causing issues with + # Python's CSV reader and Postgres's limits with the COPY command + if spatial_format_flag: + env = os.environ.copy() + env["QSV_STATS_STRING_MAX_LENGTH"] = str(conf.QSV_STATS_STRING_MAX_LENGTH) + qsv_stats = qsv.stats( + tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + env=env, + ) + else: + qsv_stats = qsv.stats( + tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") + + # Dictionary to look up stats by resource field name + resource_fields_stats = {} + + with open(qsv_stats_csv, mode="r") as inp: + reader = csv.DictReader(inp) + for row in reader: + # Add to stats dictionary with resource field name as key + resource_fields_stats[row["field"]] = {"stats": row} + + fr = {k: v for k, 
v in row.items()} + schema_field = fr.get("field", "Unnamed Column") + if schema_field.startswith("qsv_"): + break + headers.append(schema_field) + types.append(fr.get("type", "String")) + headers_min.append(fr["min"]) + headers_max.append(fr["max"]) + if conf.AUTO_INDEX_THRESHOLD: + headers_cardinality.append(int(fr.get("cardinality") or 0)) + + # Get the field stats for each field in the headers list + existing = dsu.datastore_resource_exists(resource_id) + existing_info = None + if existing: + existing_info = dict( + (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f + ) + + # if this is an existing resource + # override with types user requested in Data Dictionary + if existing_info: + types = [ + { + "text": "String", + "numeric": "Float", + "timestamp": "DateTime", + }.get(existing_info.get(h, {}).get("type_override"), t) + for t, h in zip(types, headers) + ] + + # Delete existing datastore resource before proceeding. + if existing: + logger.info(f'Deleting existing resource "{resource_id}" from datastore.') + dsu.delete_datastore_resource(resource_id) + + # 1st pass of building headers_dict + # here we map inferred types to postgresql data types + default_type = "String" + temp_headers_dicts = [ + dict( + id=field[0], + type=conf.TYPE_MAPPING.get( + str(field[1]) if field[1] else default_type, "text" + ), + ) + for field in zip(headers, types) + ] + + # 2nd pass header_dicts, checking for smartint types. + # "smartint" will automatically select the best integer data type based on the + # min/max values of the column we got from qsv stats. + # We also set the Data Dictionary Label to original column names in case we made + # the names "db-safe" as the labels are used by DataTables_view to label columns + # we also take note of datetime/timestamp fields, so we can normalize them + # to RFC3339 format, which is Postgres COPY ready + datetimecols_list = [] + headers_dicts = [] + for idx, header in enumerate(temp_headers_dicts): + if header["type"] == "smartint": + if ( + int(headers_max[idx]) <= conf.POSTGRES_INT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN + ): + header_type = "integer" + elif ( + int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN + ): + header_type = "bigint" + else: + header_type = "numeric" + else: + header_type = header["type"] + if header_type == "timestamp": + datetimecols_list.append(header["id"]) + info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) + headers_dicts.append(dict(id=header["id"], type=header_type, info=info_dict)) + + # Maintain data dictionaries from matching column names + # if data dictionary already exists for this resource as + # we want to preserve the user's data dictionary curations + if existing_info: + for h in headers_dicts: + if h["id"] in existing_info: + h["info"] = existing_info[h["id"]] + # create columns with types user requested + type_override = existing_info[h["id"]].get("type_override") + if type_override in list(conf.TYPE_MAPPING.values()): + h["type"] = type_override + + logger.info(f"Determined headers and types: {headers_dicts}...") + + # ----------------------- Frequency Table --------------------------- + # compile a frequency table for each column + qsv_freq_csv = os.path.join(temp_dir, "qsv_freq.csv") + + try: + qsv.frequency(tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a frequency table: {e}") + + resource_fields_freqs = {} + try: 
+ with open(qsv_freq_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + field = row["field"] + value = row["value"] + count = row["count"] + percentage = row["percentage"] + + # Initialize list for field if it doesn't exist + if field not in resource_fields_freqs: + resource_fields_freqs[field] = [] + + # Append the frequency data as a dict to the field's list + resource_fields_freqs[field].append( + { + "value": value, + "count": count, + "percentage": percentage, + } + ) + + logger.trace(f"Resource fields freqs: {resource_fields_freqs}") + + except IOError as e: + raise utils.JobError("Could not open frequency CSV file: {}".format(e)) + + # ------------------- Do we need to create a Preview? ----------------------- + # if conf.PREVIEW_ROWS is not zero, create a preview using qsv slice + # we do the rows_to_copy > conf.PREVIEW_ROWS to check if we don't need to slice + # the CSV anymore if we only did a partial download of N conf.PREVIEW_ROWS already + rows_to_copy = record_count + if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: + if conf.PREVIEW_ROWS > 0: + # conf.PREVIEW_ROWS is positive, slice from the beginning + logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") + qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") + try: + qsv.slice(tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice: {e}") + rows_to_copy = conf.PREVIEW_ROWS + tmp = qsv_slice_csv + else: + # conf.PREVIEW_ROWS is negative, slice from the end + # TODO: do http range request so we don't have to download the whole file + # to slice from the end + slice_len = abs(conf.PREVIEW_ROWS) + logger.info(f"Preparing {slice_len}-row preview from the end...") + qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") + try: + qsv.slice(tmp, start=-1, length=slice_len, output_file=qsv_slice_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice from the end: {e}") + rows_to_copy = slice_len + tmp = qsv_slice_csv + + dataset_stats["PREVIEW_FILE_SIZE"] = os.path.getsize(tmp) + dataset_stats["PREVIEW_RECORD_COUNT"] = rows_to_copy + + # ---------------- Normalize dates to RFC3339 format -------------------- + # if there are any datetime fields, normalize them to RFC3339 format + # so we can readily insert them as timestamps into postgresql with COPY + if datetimecols_list: + qsv_applydp_csv = os.path.join(temp_dir, "qsv_applydp.csv") + datecols = ",".join(datetimecols_list) + + logger.info( + f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format with PREFER_DMY: {conf.PREFER_DMY}...' + ) + try: + qsv.datefmt( + datecols, + tmp, + prefer_dmy=conf.PREFER_DMY, + output_file=qsv_applydp_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Applydp error: {e}") + tmp = qsv_applydp_csv + + # -------------------- QSV ANALYSIS DONE -------------------- + analysis_elapsed = time.perf_counter() - analysis_start + logger.info( + f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." 
+ ) + + # ----------------------------- PII Screening ------------------------------ + # we scan for Personally Identifiable Information (PII) using qsv's powerful + # searchset command which can SIMULTANEOUSLY compare several regexes per + # field in one pass + piiscreening_start = 0 + piiscreening_elapsed = 0 + pii_found = False + + if conf.PII_SCREENING: + piiscreening_start = time.perf_counter() + pii_found = screen_for_pii(tmp, resource, qsv, temp_dir, logger) + piiscreening_elapsed = time.perf_counter() - piiscreening_start + + dataset_stats["PII_SCREENING"] = conf.PII_SCREENING + dataset_stats["PII_FOUND"] = pii_found + + # delete the qsv index file manually + # as it was created by qsv index, and not by tempfile + os.remove(qsv_index_file) + + # at this stage, the resource is ready for COPYing to the Datastore + + if dry_run: + logger.warning("Dry run only. Returning without copying to the Datastore...") + return headers_dicts + + # ============================================================ + # COPY to Datastore + # ============================================================ + copy_start = time.perf_counter() + + if conf.PREVIEW_ROWS: + logger.info(f"COPYING {rows_to_copy}-row preview to Datastore...") + else: + logger.info(f"COPYING {rows_to_copy} rows to Datastore...") + + # first, let's create an empty datastore table w/ guessed types + dsu.send_resource_to_datastore( + resource=None, + resource_id=resource["id"], + headers=headers_dicts, + records=None, + aliases=None, + calculate_record_count=False, + ) + + copied_count = 0 + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + else: + cur = raw_connection.cursor() + + # truncate table to use copy freeze option and further increase + # performance as there is no need for WAL logs to be maintained + # https://www.postgresql.org/docs/current/populate.html#POPULATE-COPY-FROM + try: + cur.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) + ) + + except psycopg2.Error as e: + logger.warning(f"Could not TRUNCATE: {e}") + + col_names_list = [h["id"] for h in headers_dicts] + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, FREEZE 1, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(resource_id), + column_names, + ) + # specify a 1MB buffer size for COPY read from disk + with open(tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: + try: + cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + else: + copied_count = cur.rowcount + + raw_connection.commit() + # this is needed to issue a VACUUM ANALYZE + raw_connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = raw_connection.cursor() + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + analyze_cur.close() + + copy_elapsed = time.perf_counter() - copy_start + logger.info( + f'...copying done. Copied {copied_count} rows to "{resource_id}" in {copy_elapsed:,.2f} seconds.' 
+ ) + + # ================================================================================================= + # INDEXING + # ================================================================================================= + # if AUTO_INDEX_THRESHOLD > 0 or AUTO_INDEX_DATES is true + # create indices automatically based on summary statistics + # For columns w/ cardinality = record_count, it's all unique values, create a unique index + # If AUTO_INDEX_DATES is true, index all date columns + # if a column's cardinality <= AUTO_INDEX_THRESHOLD, create an index for that column + if ( + conf.AUTO_INDEX_THRESHOLD + or (conf.AUTO_INDEX_DATES and datetimecols_list) + or conf.AUTO_UNIQUE_INDEX + ): + index_start = time.perf_counter() + logger.info( + f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} Auto-index dates: {conf.AUTO_INDEX_DATES} ..." + ) + index_cur = raw_connection.cursor() + + # if auto_index_threshold == -1 + # we index all the columns + if conf.AUTO_INDEX_THRESHOLD == -1: + conf.AUTO_INDEX_THRESHOLD = record_count + + index_count = 0 + for idx, cardinality in enumerate(headers_cardinality): + curr_col = headers[idx] + if ( + conf.AUTO_INDEX_THRESHOLD > 0 + or conf.AUTO_INDEX_DATES + or conf.AUTO_UNIQUE_INDEX + ): + if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: + # all the values are unique for this column, create a unique index + if conf.PREVIEW_ROWS > 0: + unique_value_count = min(conf.PREVIEW_ROWS, cardinality) + else: + unique_value_count = cardinality + logger.info( + f'Creating UNIQUE index on "{curr_col}" for {unique_value_count} unique values...' + ) + try: + index_cur.execute( + sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( + sql.Identifier(resource_id), + sql.Identifier(curr_col), + ) + ) + except psycopg2.Error as e: + logger.warning( + f'Could not CREATE UNIQUE INDEX on "{curr_col}": {e}' + ) + index_count += 1 + elif cardinality <= conf.AUTO_INDEX_THRESHOLD or ( + conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) + ): + # cardinality <= auto_index_threshold or its a date and auto_index_date is true + # create an index + if curr_col in datetimecols_list: + logger.info( + f'Creating index on "{curr_col}" date column for {cardinality} unique value/s...' + ) + else: + logger.info( + f'Creating index on "{curr_col}" for {cardinality} unique value/s...' + ) + try: + index_cur.execute( + sql.SQL("CREATE INDEX ON {} ({})").format( + sql.Identifier(resource_id), + sql.Identifier(curr_col), + ) + ) + except psycopg2.Error as e: + logger.warning(f'Could not CREATE INDEX on "{curr_col}": {e}') + index_count += 1 + + index_cur.close() + raw_connection.commit() + + logger.info("Vacuum Analyzing table to optimize indices...") + + # this is needed to issue a VACUUM ANALYZE + raw_connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = raw_connection.cursor() + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + analyze_cur.close() + + index_elapsed = time.perf_counter() - index_start + logger.info( + f'...indexing/vacuum analysis done. Indexed {index_count} column/s in "{resource_id}" in {index_elapsed:,.2f} seconds.' + ) + + # ============================================================ + # PROCESS DRUF JINJA2 FORMULAE + # ============================================================ + # Check if there are any fields with DRUF keys in the scheming_yaml + # There are two types of DRUF keys: + # 1. 
"formula": This is used to update the field value DIRECTLY + # when the resource is created/updated. It can update both package and resource fields. + # 2. "suggestion_formula": This is used to populate the suggestion + # popovers DURING data entry/curation. + # DRUF keys are stored as jinja2 template expressions in the scheming_yaml + # and are rendered using the Jinja2 template engine. + formulae_start = time.perf_counter() + + # Fetch the scheming_yaml and package + package_id = resource["package_id"] + scheming_yaml, package = dsu.get_scheming_yaml( + package_id, scheming_yaml_type="dataset" + ) + + # Check for suggestion_formula in dataset_fields + has_suggestion_formula = any( + isinstance(field, dict) + and any(key.startswith("suggestion_formula") for key in field.keys()) + for field in scheming_yaml["dataset_fields"] + ) + + if has_suggestion_formula: + + logger.info( + 'Found suggestion formulae in schema' + ) + + # Check for "dpp_suggestions" in scheming_yaml + schema_has_dpp_suggestions = any( + isinstance(field, dict) + and field.get("field_name") == "dpp_suggestions" + for field in scheming_yaml["dataset_fields"] + ) + if not schema_has_dpp_suggestions: + logger.error( + '"dpp_suggestions" field required but not found in your schema. Ensure that your scheming.yaml file contains the "dpp_suggestions" field as a json_object.' + ) + return + else: + logger.info( + 'Found "dpp_suggestions" field in schema' + ) + + # add "dpp_suggestions" to package if it does not exist + if "dpp_suggestions" not in package: + + logger.warning( + 'Warning: "dpp_suggestions" field required to process Suggestion Formulae is not found in this package. Adding "dpp_suggestions" to package' + ) + + try: + package["dpp_suggestions"] = {} + dsu.patch_package(package) + logger.warning( + '"dpp_suggestions" field added to package' + ) + + except Exception as e: + logger.error( + f'Error adding "dpp_suggestions" field {e}' + ) + return + else: + logger.info( + 'No suggestion formulae found' + ) + + logger.trace(f"package: {package}") + + # FIRST, INITIALIZE THE FORMULA PROCESSOR + formula_processor = j2h.FormulaProcessor( + scheming_yaml, + package, + resource, + resource_fields_stats, + resource_fields_freqs, + dataset_stats, + logger, + ) + + package.setdefault("dpp_suggestions", {})[ + "STATUS" + ] = "STARTING FORMULAE PROCESSING..." + dsu.patch_package(package) + + # Clear all lru_cache before processing formulae + dsu.datastore_search.cache_clear() + dsu.datastore_search_sql.cache_clear() + dsu.datastore_info.cache_clear() + dsu.index_exists.cache_clear() + + # SECOND, WE PROCESS THE FORMULAE THAT UPDATE THE + # PACKAGE AND RESOURCE FIELDS DIRECTLY + # using the package_patch CKAN API so we only update the fields + # with formulae + package_updates = formula_processor.process_formulae( + "package", "dataset_fields", "formula" + ) + if package_updates: + # Update package with formula results + package.update(package_updates) + status_msg = "PACKAGE formulae processed..." 
+ package["dpp_suggestions"]["STATUS"] = status_msg + try: + patched_package = dsu.patch_package(package) + logger.debug(f"Package after patching: {patched_package}") + package = patched_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error patching package: {str(e)}") + + # Process resource formulae + # as this is a direct update, we update the resource dictionary directly + resource_updates = formula_processor.process_formulae( + "resource", "resource_fields", "formula" + ) + if resource_updates: + # Update resource with formula results + resource.update(resource_updates) + status_msg = "RESOURCE formulae processed..." + if resource.get("dpp_suggestions"): + resource["dpp_suggestions"]["STATUS"] = status_msg + else: + resource["dpp_suggestions"] = {"STATUS": status_msg} + logger.info(status_msg) + + # THIRD, WE PROCESS THE SUGGESTIONS THAT SHOW UP IN THE SUGGESTION POPOVER + # we update the package dpp_suggestions field + # from which the Suggestion popover UI will pick it up + package_suggestions = formula_processor.process_formulae( + "package", "dataset_fields", "suggestion_formula" + ) + if package_suggestions: + logger.trace(f"package_suggestions: {package_suggestions}") + revise_update_content = {"package": package_suggestions} + try: + status_msg = "PACKAGE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + logger.trace(f"Package after revising: {revised_package}") + package = revised_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error revising package: {str(e)}") + + # Process resource suggestion formulae + # Note how we still update the PACKAGE dpp_suggestions field + # and there is NO RESOURCE dpp_suggestions field. + # This is because suggestion formulae are used to populate the + # suggestion popover DURING data entry/curation and suggestion formulae + # may update both package and resource fields. + resource_suggestions = formula_processor.process_formulae( + "resource", "resource_fields", "suggestion_formula" + ) + if resource_suggestions: + logger.trace(f"resource_suggestions: {resource_suggestions}") + resource_name = resource["name"] + revise_update_content = {"resource": {resource_name: resource_suggestions}} + + # Handle existing suggestions + if package.get("dpp_suggestions"): + package["dpp_suggestions"].update(revise_update_content["resource"]) + else: + package["dpp_suggestions"] = revise_update_content["resource"] + + try: + status_msg = "RESOURCE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + logger.trace(f"Package after revising: {revised_package}") + package = revised_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error revising package: {str(e)}") + + # -------------------- FORMULAE PROCESSING DONE -------------------- + formulae_elapsed = time.perf_counter() - formulae_start + logger.info( + f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." 
+ ) + + # ============================================================ + # UPDATE RESOURCE METADATA + # ============================================================ + metadata_start = time.perf_counter() + logger.info("UPDATING RESOURCE METADATA...") + + # --------------------- AUTO-ALIASING ------------------------ + # aliases are human-readable, and make it easier to use than resource id hash + # when using the Datastore API and in SQL queries + alias = None + if conf.AUTO_ALIAS: + logger.info(f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ...") + # get package info, so we can construct the alias + package = dsu.get_package(resource["package_id"]) + + resource_name = resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = "" + if owner_org: + owner_org_name = owner_org.get("name") + if resource_name and package_name and owner_org_name: + # we limit it to 55, so we still have space for sequence & stats suffix + # postgres max identifier length is 63 + alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + # if AUTO_ALIAS_UNIQUE is true, check if the alias already exist, if it does + # add a sequence suffix so the new alias can be created + cur.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", + (alias + "%",), + ) + alias_query_result = cur.fetchone() + if alias_query_result: + alias_count = alias_query_result[0] + existing_alias_of = alias_query_result[1] + else: + alias_count = 0 + existing_alias_of = "" + if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: + alias_sequence = alias_count + 1 + while True: + # we do this, so we're certain the new alias does not exist + # just in case they deleted an older alias with a lower sequence # + alias = f"{alias}-{alias_sequence:03}" + cur.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", + (alias + "%",), + ) + alias_exists = cur.fetchone()[0] + if not alias_exists: + break + alias_sequence += 1 + elif alias_count == 1: + logger.warning( + f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' + ) + try: + cur.execute( + sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) + ) + except psycopg2.Error as e: + logger.warning(f"Could not drop alias/view: {e}") + + else: + logger.warning( + f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" + ) + alias = None + + # -------- should we ADD_SUMMARY_STATS_RESOURCE? ------------- + # by default, we only add summary stats if we're not doing a partial download + # (otherwise, you're summarizing the preview, not the whole file) + # That is, unless SUMMARY_STATS_WITH_PREVIEW is set to true + if conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW: + stats_resource_id = resource_id + "-stats" + + # check if the stats already exist + existing_stats = dsu.datastore_resource_exists(stats_resource_id) + # Delete existing summary-stats before proceeding. 
+ if existing_stats: + logger.info(f'Deleting existing summary stats "{stats_resource_id}".') + + cur.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (stats_resource_id + "%",), + ) + stats_alias_result = cur.fetchone() + if stats_alias_result: + existing_stats_alias_of = stats_alias_result[0] + + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + stats_aliases = [stats_resource_id] + if conf.AUTO_ALIAS: + auto_alias_stats_id = alias + "-stats" + stats_aliases.append(auto_alias_stats_id) + + # check if the summary-stats alias already exist. We need to do this as summary-stats resources + # may end up having the same alias if AUTO_ALIAS_UNIQUE is False, so we need to drop the + # existing summary stats-alias. + existing_alias_stats = dsu.datastore_resource_exists(auto_alias_stats_id) + # Delete existing auto-aliased summary-stats before proceeding. + if existing_alias_stats: + logger.info( + f'Deleting existing alias summary stats "{auto_alias_stats_id}".' + ) + + cur.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (auto_alias_stats_id + "%",), + ) + result = cur.fetchone() + if result: + existing_stats_alias_of = result[0] + + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + # run stats on stats CSV to get header names and infer data types + # we don't need summary statistics, so use the --typesonly option + try: + qsv_stats_stats = qsv.stats( + qsv_stats_csv, + typesonly=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot run stats on CSV stats: {e}") + + stats_stats = str(qsv_stats_stats.stdout).strip() + stats_stats_dict = [ + dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) + for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) + ] + + logger.info(f"stats_stats_dict: {stats_stats_dict}") + + resource_name = resource.get("name") + stats_resource = { + "package_id": resource["package_id"], + "name": resource_name + " - Summary Statistics", + "format": "CSV", + "mimetype": "text/csv", + } + stats_response = dsu.send_resource_to_datastore( + stats_resource, + resource_id=None, + headers=stats_stats_dict, + records=None, + aliases=stats_aliases, + calculate_record_count=False, + ) + + logger.info(f"stats_response: {stats_response}") + + new_stats_resource_id = stats_response["result"]["resource_id"] + + # now COPY the stats to the datastore + col_names_list = [h["id"] for h in stats_stats_dict] + logger.info( + f'ADDING SUMMARY STATISTICS {col_names_list} in "{new_stats_resource_id}" with alias/es "{stats_aliases}"...' 
+ ) + + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(new_stats_resource_id), + column_names, + ) + + with open(qsv_stats_csv, "rb") as f: + try: + cur.copy_expert(copy_sql, f) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + + stats_resource["id"] = new_stats_resource_id + stats_resource["summary_statistics"] = True + stats_resource["summary_of_resource"] = resource_id + dsu.update_resource(stats_resource) + + cur.close() + raw_connection.commit() + raw_connection.close() + + resource["datastore_active"] = True + resource["total_record_count"] = record_count + if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): + resource["preview"] = True + resource["preview_rows"] = copied_count + else: + resource["preview"] = False + resource["preview_rows"] = None + resource["partial_download"] = False + dsu.update_resource(resource) + + # tell CKAN to calculate_record_count and set alias if set + dsu.send_resource_to_datastore( + resource=None, + resource_id=resource["id"], + headers=headers_dicts, + records=None, + aliases=alias, + calculate_record_count=True, + ) + + if alias: + logger.info(f'Created alias "{alias}" for "{resource_id}"...') + + metadata_elapsed = time.perf_counter() - metadata_start + logger.info( + f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in {metadata_elapsed:,.2f} seconds." + ) + + # -------------------- DONE -------------------- + package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" + dsu.patch_package(package) + + total_elapsed = time.perf_counter() - timer_start + newline_var = "\n" + end_msg = f""" + DATAPUSHER+ JOB DONE! +   Download: {fetch_elapsed:,.2f} +   Analysis: {analysis_elapsed:,.2f}{(newline_var + f" PII Screening: {piiscreening_elapsed:,.2f}") if piiscreening_elapsed > 0 else ""} +   COPYing: {copy_elapsed:,.2f} +   Indexing: {index_elapsed:,.2f} +   Formulae processing: {formulae_elapsed:,.2f} +   Resource metadata updates: {metadata_elapsed:,.2f} + TOTAL ELAPSED TIME: {total_elapsed:,.2f} + """ + logger.info(end_msg)
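# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the "smartint" widening rule
# applied while building headers_dicts. qsv stats reports each column's
# min/max, and the job picks the narrowest Postgres integer type that can hold
# that range, falling back to NUMERIC. The bound constants below stand in for
# conf.POSTGRES_INT_MAX / conf.POSTGRES_BIGINT_MAX and friends; they are
# assumptions here, not values read from the extension's config.

POSTGRES_INT_MIN, POSTGRES_INT_MAX = -2_147_483_648, 2_147_483_647
POSTGRES_BIGINT_MIN, POSTGRES_BIGINT_MAX = (
    -9_223_372_036_854_775_808,
    9_223_372_036_854_775_807,
)


def smartint_pg_type(col_min: int, col_max: int) -> str:
    """Map an observed integer range to a Postgres column type."""
    if POSTGRES_INT_MIN <= col_min and col_max <= POSTGRES_INT_MAX:
        return "integer"
    if POSTGRES_BIGINT_MIN <= col_min and col_max <= POSTGRES_BIGINT_MAX:
        return "bigint"
    return "numeric"


assert smartint_pg_type(0, 1_000) == "integer"
assert smartint_pg_type(0, 3_000_000_000) == "bigint"
assert smartint_pg_type(0, 10**20) == "numeric"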
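# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): what the date-normalization
# step accomplishes. Columns flagged as datetime are rewritten to ISO 8601 /
# RFC 3339 so Postgres can ingest them directly as timestamps during COPY. The
# job delegates this to qsv's datefmt command; the plain-Python equivalent
# below with dateutil is only meant to show the transformation.

from dateutil.parser import parse as parsedate

prefer_dmy = False  # mirrors the PREFER_DMY setting
for raw in ["03/04/2021", "2021-3-4 5:06 PM", "4 Mar 2021"]:
    print(f"{raw!r} -> {parsedate(raw, dayfirst=prefer_dmy).isoformat()}")
# '03/04/2021'       -> 2021-03-04T00:00:00  (read as MDY because prefer_dmy is False)
# '2021-3-4 5:06 PM' -> 2021-03-04T17:06:00
# '4 Mar 2021'       -> 2021-03-04T00:00:00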
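# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the TRUNCATE + COPY ... FREEZE
# pattern used to load the prepared CSV into the Datastore table. FREEZE is
# only legal when the table was created or truncated in the same transaction,
# which is why the TRUNCATE comes first. The DSN, table name, and helper name
# below are placeholders.

from typing import List

import psycopg2
from psycopg2 import sql


def copy_csv_to_table(dsn: str, table: str, columns: List[str], csv_path: str) -> int:
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            # TRUNCATE in the same transaction so the FREEZE option is allowed
            cur.execute(sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(table)))
            copy_stmt = sql.SQL(
                "COPY {} ({}) FROM STDIN "
                "WITH (FORMAT CSV, FREEZE 1, HEADER 1, ENCODING 'UTF8')"
            ).format(
                sql.Identifier(table),
                sql.SQL(",").join(sql.Identifier(c) for c in columns),
            )
            with open(csv_path, "rb") as f:
                cur.copy_expert(copy_stmt, f)
            copied = cur.rowcount  # rows written by COPY
        conn.commit()
        return copied
    finally:
        conn.close()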
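# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the per-column auto-indexing
# decision, boiled down. A unique index when every value is distinct and
# AUTO_UNIQUE_INDEX is on; a plain index when the cardinality is at or below
# AUTO_INDEX_THRESHOLD, or the column is a date and AUTO_INDEX_DATES is on;
# otherwise nothing. (The -1 "index every column" special case is omitted.)


def index_action(cardinality, record_count, is_date_col,
                 threshold, index_dates, unique_index):
    """Return 'unique', 'index', or None for a single column."""
    if unique_index and cardinality == record_count:
        return "unique"
    if cardinality <= threshold or (index_dates and is_date_col):
        return "index"
    return None


# e.g. a low-cardinality "region" column gets a plain index, a surrogate key
# a unique one, and a high-cardinality free-text column is left unindexed.
assert index_action(12, 10_000, False, 1_000, True, True) == "index"
assert index_action(10_000, 10_000, False, 1_000, True, True) == "unique"
assert index_action(9_500, 10_000, False, 1_000, False, True) is None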
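# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): how the suggestion-formula
# check over the scheming YAML behaves. The dataset fields below are made up,
# and the template strings are placeholders rather than real DP+ formulae;
# only the presence of a key starting with "suggestion_formula" matters.

dataset_fields = [
    {"field_name": "title", "label": "Title"},
    {"field_name": "record_count", "formula": "{{ placeholder_expression }}"},
    {"field_name": "notes", "suggestion_formula": "{{ placeholder_expression }}"},
]

has_suggestion_formula = any(
    isinstance(field, dict)
    and any(key.startswith("suggestion_formula") for key in field)
    for field in dataset_fields
)

print(has_suggestion_formula)  # True: the "notes" field carries a suggestion_formula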
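# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the auto-alias naming scheme.
# Resource, package, and organization names are joined and truncated to 55
# characters so a "-NNN" de-duplication suffix and a "-stats" suffix still fit
# within Postgres's 63-character identifier limit. The helper is hypothetical;
# the job builds the string inline.

from typing import Optional


def build_alias(resource_name: str, package_name: str, org_name: str,
                sequence: Optional[int] = None) -> str:
    alias = f"{resource_name}-{package_name}-{org_name}"[:55]
    if sequence is not None:
        alias = f"{alias}-{sequence:03}"
    return alias


print(build_alias("census-2020", "population", "stats-agency"))             # base alias
print(build_alias("census-2020", "population", "stats-agency", 2))          # after a name clash
print(build_alias("census-2020", "population", "stats-agency") + "-stats")  # stats companion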