diff --git a/ckanext/datapusher_plus/jobs.py b/ckanext/datapusher_plus/jobs.py index 7d2781a..c7bdca1 100644 --- a/ckanext/datapusher_plus/jobs.py +++ b/ckanext/datapusher_plus/jobs.py @@ -1,1625 +1,31 @@ # -*- coding: utf-8 -*- -# flake8: noqa: E501 - -# Standard library imports -import csv -import hashlib -import locale -import mimetypes -import os -import subprocess -import tempfile -import time -from urllib.parse import urlsplit, urlparse -import logging -import uuid -import sys -import json -import requests -from pathlib import Path -from typing import Dict, Any, Optional, List - -# Third-party imports -import psycopg2 -from psycopg2 import sql -from datasize import DataSize -from dateutil.parser import parse as parsedate -import traceback -import sqlalchemy as sa -from rq import get_current_job - -import ckanext.datapusher_plus.utils as utils -import ckanext.datapusher_plus.helpers as dph -import ckanext.datapusher_plus.jinja2_helpers as j2h -from ckanext.datapusher_plus.job_exceptions import HTTPError -import ckanext.datapusher_plus.config as conf -import ckanext.datapusher_plus.spatial_helpers as sh -import ckanext.datapusher_plus.datastore_utils as dsu -from ckanext.datapusher_plus.logging_utils import TRACE -from ckanext.datapusher_plus.qsv_utils import QSVCommand -from ckanext.datapusher_plus.pii_screening import screen_for_pii - -if locale.getdefaultlocale()[0]: - lang, encoding = locale.getdefaultlocale() - locale.setlocale(locale.LC_ALL, locale=(lang, encoding)) -else: - locale.setlocale(locale.LC_ALL, "") - - -def validate_input(input: Dict[str, Any]) -> None: - # Especially validate metadata which is provided by the user - if "metadata" not in input: - raise utils.JobError("Metadata missing") - - data = input["metadata"] - - if "resource_id" not in data: - raise utils.JobError("No id provided.") - - -def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: - api_token = utils.get_dp_plus_user_apitoken() - headers: Dict[str, str] = { - "Content-Type": "application/json", - "Authorization": api_token, - } - - try: - result = requests.post( - result_url, - data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), - verify=conf.SSL_VERIFY, - headers=headers, - ) - except requests.ConnectionError: - return False - - return result.status_code == requests.codes.ok - - -def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: - """ - This is the main function that is called by the datapusher_plus worker - - Errors are caught and logged in the database - - Args: - input: Dictionary containing metadata and other job information - - Returns: - Optional[str]: Returns "error" if there was an error, None otherwise - """ - job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") - callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) - - job_id = get_current_job().id - errored = False - try: - push_to_datastore(input, job_id) - job_dict["status"] = "complete" - dph.mark_job_as_completed(job_id, job_dict) - except utils.JobError as e: - dph.mark_job_as_errored(job_id, str(e)) - job_dict["status"] = "error" - job_dict["error"] = str(e) - log = logging.getLogger(__name__) - log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") - errored = True - except Exception as e: - dph.mark_job_as_errored( - job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) - ) - job_dict["status"] = "error" - job_dict["error"] = str(e) - log = logging.getLogger(__name__) - log.error(f"Datapusher Plus error: {e}, 
{traceback.format_exc()}") - errored = True - finally: - # job_dict is defined in datapusher_hook's docstring - is_saved_ok = callback_datapusher_hook( - result_url=input["result_url"], job_dict=job_dict - ) - errored = errored or not is_saved_ok - return "error" if errored else None - - -def push_to_datastore( - input: Dict[str, Any], task_id: str, dry_run: bool = False -) -> Optional[List[Dict[str, Any]]]: - """Download and parse a resource push its data into CKAN's DataStore. - - An asynchronous job that gets a resource from CKAN, downloads the - resource's data file and, if the data file has changed since last time, - parses the data and posts it into CKAN's DataStore. - - Args: - input: Dictionary containing metadata and other job information - task_id: Unique identifier for the task - dry_run: If True, fetch and parse the data file but don't actually post the - data to the DataStore, instead return the data headers and rows that - would have been posted. - - Returns: - Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows - that would have been posted. Otherwise returns None. - """ - # Ensure temporary files are removed after run - with tempfile.TemporaryDirectory() as temp_dir: - return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) - - -def _push_to_datastore( - task_id: str, - input: Dict[str, Any], - dry_run: bool = False, - temp_dir: Optional[str] = None, -) -> Optional[List[Dict[str, Any]]]: - # add job to dn (datapusher_plus_jobs table) - try: - dph.add_pending_job(task_id, **input) - except sa.exc.IntegrityError: - raise utils.JobError("Job already exists.") - handler = utils.StoringHandler(task_id, input) - logger = logging.getLogger(task_id) - logger.addHandler(handler) - - # also show logs on stderr - logger.addHandler(logging.StreamHandler()) - - # set the log level to the config upload_log_level - try: - log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) - except AttributeError: - # fallback to our custom TRACE level - log_level = TRACE - - # set the log level to the config upload_log_level - logger.setLevel(logging.INFO) - logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") - logger.setLevel(log_level) - - # check if conf.QSV_BIN exists - if not Path(conf.QSV_BIN).is_file(): - raise utils.JobError(f"{conf.QSV_BIN} not found.") - - # Initialize QSVCommand - qsv = QSVCommand(logger=logger) - - validate_input(input) - - data = input["metadata"] - - ckan_url = data["ckan_url"] - resource_id = data["resource_id"] - try: - resource = dsu.get_resource(resource_id) - except utils.JobError: - # try again in 5 seconds just incase CKAN is slow at adding resource - time.sleep(5) - resource = dsu.get_resource(resource_id) - - # check if the resource url_type is a datastore - if resource.get("url_type") == "datastore": - logger.info("Dump files are managed with the Datastore API") - return - - # check scheme - resource_url = resource.get("url") - scheme = urlsplit(resource_url).scheme - if scheme not in ("http", "https", "ftp"): - raise utils.JobError("Only http, https, and ftp resources may be fetched.") - - # ========================================================================== - # DOWNLOAD - # ========================================================================== - timer_start = time.perf_counter() - dataset_stats = {} - - # fetch the resource data - logger.info(f"Fetching from: {resource_url}...") - headers: Dict[str, str] = {} - if resource.get("url_type") == "upload": - # If this is an 
uploaded file to CKAN, authenticate the request, - # otherwise we won't get file from private resources - api_token = utils.get_dp_plus_user_apitoken() - headers["Authorization"] = api_token - - # If the ckan_url differs from this url, rewrite this url to the ckan - # url. This can be useful if ckan is behind a firewall. - if not resource_url.startswith(ckan_url): - new_url = urlparse(resource_url) - rewrite_url = urlparse(ckan_url) - new_url = new_url._replace( - scheme=rewrite_url.scheme, netloc=rewrite_url.netloc - ) - resource_url = new_url.geturl() - logger.info(f"Rewritten resource url to: {resource_url}") - - try: - kwargs: Dict[str, Any] = { - "headers": headers, - "timeout": conf.TIMEOUT, - "verify": conf.SSL_VERIFY, - "stream": True, - } - if conf.USE_PROXY: - kwargs["proxies"] = { - "http": conf.DOWNLOAD_PROXY, - "https": conf.DOWNLOAD_PROXY, - } - with requests.get(resource_url, **kwargs) as response: - response.raise_for_status() - - cl = response.headers.get("content-length") - max_content_length = conf.MAX_CONTENT_LENGTH - ct = response.headers.get("content-type") - - try: - if cl and int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: - raise utils.JobError( - f"Resource too large to download: {DataSize(int(cl)):.2MB} > max ({DataSize(int(max_content_length)):.2MB})." - ) - except ValueError: - pass - - resource_format = resource.get("format").upper() - - # if format was not specified, try to get it from mime type - if not resource_format: - logger.info("File format: NOT SPECIFIED") - # if we have a mime type, get the file extension from the response header - if ct: - resource_format = mimetypes.guess_extension(ct.split(";")[0]) - - if resource_format is None: - raise utils.JobError( - "Cannot determine format from mime type. Please specify format." - ) - logger.info(f"Inferred file format: {resource_format}") - else: - raise utils.JobError( - "Server did not return content-type. Please specify format." - ) - else: - logger.info(f"File format: {resource_format}") - - tmp = os.path.join(temp_dir, "tmp." + resource_format) - length = 0 - # using MD5 for file deduplication only - # no need for it to be cryptographically secure - m = hashlib.md5() # DevSkim: ignore DS126858 - - # download the file - if cl: - logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") - else: - logger.info("Downloading file of unknown size...") - - with open(tmp, "wb") as tmp_file: - for chunk in response.iter_content(conf.CHUNK_SIZE): - length += len(chunk) - if length > max_content_length and not conf.PREVIEW_ROWS: - raise utils.JobError( - f"Resource too large to process: {length} > max ({max_content_length})." - ) - tmp_file.write(chunk) - m.update(chunk) - - except requests.HTTPError as e: - raise HTTPError( - f"DataPusher+ received a bad HTTP response when trying to download " - f"the data file from {resource_url}. 
Status code: {e.response.status_code}, " - f"Response content: {e.response.content}", - status_code=e.response.status_code, - request_url=resource_url, - response=e.response.content, - ) - except requests.RequestException as e: - raise HTTPError( - message=str(e), - status_code=None, - request_url=resource_url, - response=None, - ) - - file_hash = m.hexdigest() - dataset_stats["ORIGINAL_FILE_SIZE"] = length - - # check if the resource metadata (like data dictionary data types) - # has been updated since the last fetch - resource_updated = False - resource_last_modified = resource.get("last_modified") - if resource_last_modified: - resource_last_modified = parsedate(resource_last_modified) - file_last_modified = response.headers.get("last-modified") - if file_last_modified: - file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) - if file_last_modified < resource_last_modified: - resource_updated = True - - if ( - resource.get("hash") == file_hash - and not data.get("ignore_hash") - and not conf.IGNORE_FILE_HASH - and not resource_updated - ): - logger.warning(f"Upload skipped as the file hash hasn't changed: {file_hash}.") - return - - resource["hash"] = file_hash - - fetch_elapsed = time.perf_counter() - timer_start - logger.info( - f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." - ) - - # Check if the file is a zip file - unzipped_format = "" - if resource_format.upper() == "ZIP": - logger.info("Processing ZIP file...") - - file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( - tmp, temp_dir, logger - ) - if not file_count: - logger.error("ZIP file invalid or no files found in ZIP file.") - return - logger.info( - f"More than one file in the ZIP file ({file_count} files), saving metadata..." - if file_count > 1 - else f"Extracted {unzipped_format} file: {extracted_path}" - ) - tmp = extracted_path - - # =================================================================================== - # ANALYZE WITH QSV - # =================================================================================== - # Start Analysis using qsv instead of messytables, as - # 1) its type inferences are bullet-proof not guesses as it scans the entire file, - # 2) its super-fast, and - # 3) it has addl data-wrangling capabilities we use in DP+ (e.g. stats, dedup, etc.) - dupe_count = 0 - record_count = 0 - analysis_start = time.perf_counter() - logger.info("ANALYZING WITH QSV..") - - # flag to check if the file is a spatial format - spatial_format_flag = False - simplification_failed_flag = False - # ----------------- is it a spreadsheet? --------------- - # check content type or file extension if its a spreadsheet - spreadsheet_extensions = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] - file_format = resource.get("format").upper() - if ( - file_format in spreadsheet_extensions - or unzipped_format in spreadsheet_extensions - ): - # if so, export spreadsheet as a CSV file - default_excel_sheet = conf.DEFAULT_EXCEL_SHEET - file_format = unzipped_format if unzipped_format != "" else file_format - logger.info(f"Converting {file_format} sheet {default_excel_sheet} to CSV...") - # first, we need a temporary spreadsheet filename with the right file extension - # we only need the filename though, that's why we remove it - # and create a hardlink to the file we got from CKAN - qsv_spreadsheet = os.path.join(temp_dir, "qsv_spreadsheet." 
+ file_format) - os.link(tmp, qsv_spreadsheet) - - # run `qsv excel` and export it to a CSV - # use --trim option to trim column names and the data - qsv_excel_csv = os.path.join(temp_dir, "qsv_excel.csv") - try: - qsv_excel = qsv.excel( - qsv_spreadsheet, - sheet=default_excel_sheet, - trim=True, - output_file=qsv_excel_csv, - ) - except utils.JobError as e: - raise utils.JobError( - f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" - ) - excel_export_msg = qsv_excel.stderr - logger.info(f"{excel_export_msg}...") - tmp = qsv_excel_csv - elif resource_format.upper() in ["SHP", "QGIS", "GEOJSON"]: - logger.info("SHAPEFILE or GEOJSON file detected...") - - qsv_spatial_file = os.path.join( - temp_dir, - "qsv_spatial_" + str(uuid.uuid4()) + "." + resource_format, - ) - os.link(tmp, qsv_spatial_file) - qsv_spatial_csv = os.path.join(temp_dir, "qsv_spatial.csv") - - if conf.AUTO_SPATIAL_SIMPLIFICATION: - # Try to convert spatial file to CSV using spatial_helpers - logger.info( - f"Converting spatial file to CSV with a simplification relative tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." - ) - - try: - # Use the convert_to_csv function from spatial_helpers - success, error_message, bounds = sh.process_spatial_file( - qsv_spatial_file, - resource_format, - output_csv_path=qsv_spatial_csv, - tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, - task_logger=logger, - ) - - if success: - logger.info( - "Spatial file successfully simplified and converted to CSV" - ) - tmp = qsv_spatial_csv - - # Check if the simplified resource already exists - simplified_resource_name = ( - os.path.splitext(resource["name"])[0] - + "_simplified" - + os.path.splitext(resource["name"])[1] - ) - existing_resource, existing_resource_id = dsu.resource_exists( - resource["package_id"], simplified_resource_name - ) - - if existing_resource: - logger.info( - "Simplified resource already exists. Replacing it..." - ) - dsu.delete_resource(existing_resource_id) - else: - logger.info( - "Simplified resource does not exist. Uploading it..." - ) - new_simplified_resource = { - "package_id": resource["package_id"], - "name": os.path.splitext(resource["name"])[0] - + "_simplified" - + os.path.splitext(resource["name"])[1], - "url": "", - "format": resource["format"], - "hash": "", - "mimetype": resource["mimetype"], - "mimetype_inner": resource["mimetype_inner"], - } - - # Add bounds information if available - if bounds: - minx, miny, maxx, maxy = bounds - new_simplified_resource.update( - { - "dpp_spatial_extent": { - "type": "BoundingBox", - "coordinates": [ - [minx, miny], - [maxx, maxy], - ], - } - } - ) - logger.info( - f"Added dpp_spatial_extent to resource metadata: {bounds}" - ) - - dsu.upload_resource(new_simplified_resource, qsv_spatial_file) - - # delete the simplified spatial file - os.remove(qsv_spatial_file) - - simplification_failed_flag = False - else: - logger.warning( - f"Upload of simplified spatial file failed: {error_message}" - ) - simplification_failed_flag = True - except Exception as e: - logger.warning(f"Simplification and conversion failed: {str(e)}") - logger.warning( - f"Simplification and conversion failed. Using qsv geoconvert to convert to CSV, truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." 
- ) - simplification_failed_flag = True - pass - - # If we are not auto-simplifying or simplification failed, use qsv geoconvert - if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed_flag: - logger.info("Converting spatial file to CSV using qsv geoconvert...") - - # Run qsv geoconvert - qsv_geoconvert_csv = os.path.join(temp_dir, "qsv_geoconvert.csv") - try: - qsv.geoconvert( - tmp, - resource_format, - "csv", - max_length=conf.QSV_STATS_STRING_MAX_LENGTH, - output_file=qsv_geoconvert_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"qsv geoconvert failed: {e}") - - tmp = qsv_geoconvert_csv - logger.info("Geoconverted successfully") - - else: - # --- its not a spreadsheet nor a spatial format, its a CSV/TSV/TAB file ------ - # Normalize & transcode to UTF-8 using `qsv input`. We need to normalize as - # it could be a CSV/TSV/TAB dialect with differing delimiters, quoting, etc. - # Using qsv input's --output option also auto-transcodes to UTF-8. - # Note that we only change the workfile, the resource file itself is unchanged. - - # ------------------- Normalize to CSV --------------------- - qsv_input_csv = os.path.join(temp_dir, "qsv_input.csv") - # if resource_format is CSV we don't need to normalize - if resource_format.upper() == "CSV": - logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") - else: - # if not CSV (e.g. TSV, TAB, etc.) we need to normalize to CSV - logger.info(f"Normalizing/UTF-8 transcoding {resource_format} to CSV...") - - qsv_input_utf_8_encoded_csv = os.path.join( - temp_dir, "qsv_input_utf_8_encoded.csv" - ) - - # using uchardet to determine encoding - file_encoding = subprocess.run( - ["uchardet", tmp], - check=True, - capture_output=True, - text=True, - ) - logger.info(f"Identified encoding of the file: {file_encoding.stdout}") - - # trim the encoding string - file_encoding.stdout = file_encoding.stdout.strip() - - # using iconv to re-encode in UTF-8 OR ASCII (as ASCII is a subset of UTF-8) - if file_encoding.stdout != "UTF-8" and file_encoding.stdout != "ASCII": - logger.info( - f"File is not UTF-8 encoded. Re-encoding from {file_encoding.stdout} to UTF-8" - ) - try: - cmd = subprocess.run( - [ - "iconv", - "-f", - file_encoding.stdout, - "-t", - "UTF-8", - tmp, - ], - capture_output=True, - check=True, - ) - except subprocess.CalledProcessError as e: - raise utils.JobError( - f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" - ) - f = open(qsv_input_utf_8_encoded_csv, "wb") - f.write(cmd.stdout) - f.close() - logger.info("Successfully re-encoded to UTF-8") - - else: - qsv_input_utf_8_encoded_csv = tmp - try: - qsv.input(tmp, trim_headers=True, output_file=qsv_input_csv) - except utils.JobError as e: - raise utils.JobError( - f"Job aborted as the file cannot be normalized/transcoded: {e}." - ) - tmp = qsv_input_csv - logger.info("Normalized & transcoded...") - - # ------------------------------------- Validate CSV -------------------------------------- - # Run an RFC4180 check with `qsv validate` against the normalized, UTF-8 encoded CSV file. - # Even excel exported CSVs can be potentially invalid, as it allows the export of "flexible" - # CSVs - i.e. rows may have different column counts. - # If it passes validation, we can handle it with confidence downstream as a "normal" CSV. 
- logger.info("Validating CSV...") - try: - qsv.validate(tmp) - except utils.JobError as e: - raise utils.JobError(f"qsv validate failed: {e}") - - logger.info("Well-formed, valid CSV file confirmed...") - - # --------------------- Sortcheck -------------------------- - # if SORT_AND_DUPE_CHECK is True or DEDUP is True - # check if the file is sorted and if it has duplicates - # get the record count, unsorted breaks and duplicate count as well - if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: - logger.info("Checking for duplicates and if the CSV is sorted...") - - try: - qsv_sortcheck = qsv.sortcheck(tmp, json_output=True, uses_stdio=True) - except utils.JobError as e: - raise utils.JobError( - f"Failed to check if CSV is sorted and has duplicates: {e}" - ) - - try: - # Handle both subprocess.CompletedProcess and dict outputs - stdout_content = ( - qsv_sortcheck.stdout - if hasattr(qsv_sortcheck, "stdout") - else qsv_sortcheck.get("stdout") - ) - sortcheck_json = json.loads(str(stdout_content)) - except (json.JSONDecodeError, AttributeError) as e: - raise utils.JobError(f"Failed to parse sortcheck JSONoutput: {e}") - - try: - # Extract and validate required fields - is_sorted = bool(sortcheck_json.get("sorted", False)) - record_count = int(sortcheck_json.get("record_count", 0)) - unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) - dupe_count = int(sortcheck_json.get("dupe_count", 0)) - dataset_stats["IS_SORTED"] = is_sorted - dataset_stats["RECORD_COUNT"] = record_count - dataset_stats["UNSORTED_BREAKS"] = unsorted_breaks - dataset_stats["DUPE_COUNT"] = dupe_count - except (ValueError, TypeError) as e: - raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") - - # Format the message with clear statistics - sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" - if is_sorted and dupe_count > 0: - sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" - - logger.info(sortcheck_msg) - - # --------------- Do we need to dedup? ------------------ - if conf.DEDUP and dupe_count > 0: - qsv_dedup_csv = os.path.join(temp_dir, "qsv_dedup.csv") - logger.info(f"{dupe_count} duplicate rows found. Deduping...") - - try: - qsv.extdedup(tmp, qsv_dedup_csv) - except utils.JobError as e: - raise utils.JobError(f"Check for duplicates error: {e}") - - dataset_stats["DEDUPED"] = True - tmp = qsv_dedup_csv - logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") - else: - dataset_stats["DEDUPED"] = False - - # ----------------------- Headers & Safenames --------------------------- - # get existing header names, so we can use them for data dictionary labels - # should we need to change the column name to make it "db-safe" - try: - qsv_headers = qsv.headers(tmp, just_names=True) - except utils.JobError as e: - raise utils.JobError(f"Cannot scan CSV headers: {e}") - original_headers = str(qsv_headers.stdout).strip() - original_header_dict = { - idx: ele for idx, ele in enumerate(original_headers.splitlines()) - } - - # now, ensure our column/header names identifiers are "safe names" - # i.e. 
valid postgres/CKAN Datastore identifiers - qsv_safenames_csv = os.path.join(temp_dir, "qsv_safenames.csv") - logger.info('Checking for "database-safe" header names...') - try: - qsv_safenames = qsv.safenames( - tmp, - mode="json", - reserved=conf.RESERVED_COLNAMES, - prefix=conf.UNSAFE_PREFIX, - uses_stdio=True, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot scan CSV headers: {e}") - - unsafe_json = json.loads(str(qsv_safenames.stdout)) - unsafe_headers = unsafe_json["unsafe_headers"] - - if unsafe_headers: - logger.info( - f'"{len(unsafe_headers)} unsafe" header names found ({unsafe_headers}). Sanitizing..."' - ) - qsv_safenames = qsv.safenames( - tmp, mode="conditional", output_file=qsv_safenames_csv - ) - tmp = qsv_safenames_csv - else: - logger.info("No unsafe header names found...") - - # ---------------------- Type Inferencing ----------------------- - # at this stage, we have a "clean" CSV ready for Type Inferencing - - # first, index csv for speed - count, stats and slice - # are all accelerated/multithreaded when an index is present - try: - qsv_index_file = tmp + ".idx" - qsv.index(tmp) - except utils.JobError as e: - raise utils.JobError(f"Cannot index CSV: {e}") - - # if SORT_AND_DUPE_CHECK = True, we already know the record count - # so we can skip qsv count. - if not conf.SORT_AND_DUPE_CHECK: - # get record count, this is instantaneous with an index - try: - qsv_count = qsv.count(tmp) - record_count = int(str(qsv_count.stdout).strip()) - dataset_stats["RECORD_COUNT"] = record_count - except utils.JobError as e: - raise utils.JobError(f"Cannot count records in CSV: {e}") - - # its empty, nothing to do - if record_count == 0: - logger.warning("Upload skipped as there are zero records.") - return - - # log how many records we detected - unique_qualifier = "" - if conf.DEDUP: - unique_qualifier = "unique" - logger.info(f"{record_count} {unique_qualifier} records detected...") - - # run qsv stats to get data types and summary statistics - logger.info("Inferring data types and compiling statistics...") - headers = [] - types = [] - headers_min = [] - headers_max = [] - headers_cardinality = [] - qsv_stats_csv = os.path.join(temp_dir, "qsv_stats.csv") - - try: - # If the file is a spatial format, we need to use --max-length - # to truncate overly long strings from causing issues with - # Python's CSV reader and Postgres's limits with the COPY command - if spatial_format_flag: - env = os.environ.copy() - env["QSV_STATS_STRING_MAX_LENGTH"] = str(conf.QSV_STATS_STRING_MAX_LENGTH) - qsv_stats = qsv.stats( - tmp, - infer_dates=True, - dates_whitelist=conf.QSV_DATES_WHITELIST, - stats_jsonl=True, - prefer_dmy=conf.PREFER_DMY, - cardinality=bool(conf.AUTO_INDEX_THRESHOLD), - summary_stats_options=conf.SUMMARY_STATS_OPTIONS, - output_file=qsv_stats_csv, - env=env, - ) - else: - qsv_stats = qsv.stats( - tmp, - infer_dates=True, - dates_whitelist=conf.QSV_DATES_WHITELIST, - stats_jsonl=True, - prefer_dmy=conf.PREFER_DMY, - cardinality=bool(conf.AUTO_INDEX_THRESHOLD), - summary_stats_options=conf.SUMMARY_STATS_OPTIONS, - output_file=qsv_stats_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") - - # Dictionary to look up stats by resource field name - resource_fields_stats = {} - - with open(qsv_stats_csv, mode="r") as inp: - reader = csv.DictReader(inp) - for row in reader: - # Add to stats dictionary with resource field name as key - resource_fields_stats[row["field"]] = {"stats": row} - - fr = {k: v for k, 
v in row.items()} - schema_field = fr.get("field", "Unnamed Column") - if schema_field.startswith("qsv_"): - break - headers.append(schema_field) - types.append(fr.get("type", "String")) - headers_min.append(fr["min"]) - headers_max.append(fr["max"]) - if conf.AUTO_INDEX_THRESHOLD: - headers_cardinality.append(int(fr.get("cardinality") or 0)) - - # Get the field stats for each field in the headers list - existing = dsu.datastore_resource_exists(resource_id) - existing_info = None - if existing: - existing_info = dict( - (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f - ) - - # if this is an existing resource - # override with types user requested in Data Dictionary - if existing_info: - types = [ - { - "text": "String", - "numeric": "Float", - "timestamp": "DateTime", - }.get(existing_info.get(h, {}).get("type_override"), t) - for t, h in zip(types, headers) - ] - - # Delete existing datastore resource before proceeding. - if existing: - logger.info(f'Deleting existing resource "{resource_id}" from datastore.') - dsu.delete_datastore_resource(resource_id) - - # 1st pass of building headers_dict - # here we map inferred types to postgresql data types - default_type = "String" - temp_headers_dicts = [ - dict( - id=field[0], - type=conf.TYPE_MAPPING.get( - str(field[1]) if field[1] else default_type, "text" - ), - ) - for field in zip(headers, types) - ] - - # 2nd pass header_dicts, checking for smartint types. - # "smartint" will automatically select the best integer data type based on the - # min/max values of the column we got from qsv stats. - # We also set the Data Dictionary Label to original column names in case we made - # the names "db-safe" as the labels are used by DataTables_view to label columns - # we also take note of datetime/timestamp fields, so we can normalize them - # to RFC3339 format, which is Postgres COPY ready - datetimecols_list = [] - headers_dicts = [] - for idx, header in enumerate(temp_headers_dicts): - if header["type"] == "smartint": - if ( - int(headers_max[idx]) <= conf.POSTGRES_INT_MAX - and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN - ): - header_type = "integer" - elif ( - int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX - and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN - ): - header_type = "bigint" - else: - header_type = "numeric" - else: - header_type = header["type"] - if header_type == "timestamp": - datetimecols_list.append(header["id"]) - info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) - headers_dicts.append(dict(id=header["id"], type=header_type, info=info_dict)) - - # Maintain data dictionaries from matching column names - # if data dictionary already exists for this resource as - # we want to preserve the user's data dictionary curations - if existing_info: - for h in headers_dicts: - if h["id"] in existing_info: - h["info"] = existing_info[h["id"]] - # create columns with types user requested - type_override = existing_info[h["id"]].get("type_override") - if type_override in list(conf.TYPE_MAPPING.values()): - h["type"] = type_override - - logger.info(f"Determined headers and types: {headers_dicts}...") - - # ----------------------- Frequency Table --------------------------- - # compile a frequency table for each column - qsv_freq_csv = os.path.join(temp_dir, "qsv_freq.csv") - - try: - qsv.frequency(tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a frequency table: {e}") - - resource_fields_freqs = {} - try: 
- with open(qsv_freq_csv, "r") as f: - reader = csv.DictReader(f) - for row in reader: - field = row["field"] - value = row["value"] - count = row["count"] - percentage = row["percentage"] - - # Initialize list for field if it doesn't exist - if field not in resource_fields_freqs: - resource_fields_freqs[field] = [] - - # Append the frequency data as a dict to the field's list - resource_fields_freqs[field].append( - { - "value": value, - "count": count, - "percentage": percentage, - } - ) - - logger.trace(f"Resource fields freqs: {resource_fields_freqs}") - - except IOError as e: - raise utils.JobError("Could not open frequency CSV file: {}".format(e)) - - # ------------------- Do we need to create a Preview? ----------------------- - # if conf.PREVIEW_ROWS is not zero, create a preview using qsv slice - # we do the rows_to_copy > conf.PREVIEW_ROWS to check if we don't need to slice - # the CSV anymore if we only did a partial download of N conf.PREVIEW_ROWS already - rows_to_copy = record_count - if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: - if conf.PREVIEW_ROWS > 0: - # conf.PREVIEW_ROWS is positive, slice from the beginning - logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") - qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") - try: - qsv.slice(tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a preview slice: {e}") - rows_to_copy = conf.PREVIEW_ROWS - tmp = qsv_slice_csv - else: - # conf.PREVIEW_ROWS is negative, slice from the end - # TODO: do http range request so we don't have to download the whole file - # to slice from the end - slice_len = abs(conf.PREVIEW_ROWS) - logger.info(f"Preparing {slice_len}-row preview from the end...") - qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") - try: - qsv.slice(tmp, start=-1, length=slice_len, output_file=qsv_slice_csv) - except utils.JobError as e: - raise utils.JobError(f"Cannot create a preview slice from the end: {e}") - rows_to_copy = slice_len - tmp = qsv_slice_csv - - dataset_stats["PREVIEW_FILE_SIZE"] = os.path.getsize(tmp) - dataset_stats["PREVIEW_RECORD_COUNT"] = rows_to_copy - - # ---------------- Normalize dates to RFC3339 format -------------------- - # if there are any datetime fields, normalize them to RFC3339 format - # so we can readily insert them as timestamps into postgresql with COPY - if datetimecols_list: - qsv_applydp_csv = os.path.join(temp_dir, "qsv_applydp.csv") - datecols = ",".join(datetimecols_list) - - logger.info( - f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format with PREFER_DMY: {conf.PREFER_DMY}...' - ) - try: - qsv.datefmt( - datecols, - tmp, - prefer_dmy=conf.PREFER_DMY, - output_file=qsv_applydp_csv, - ) - except utils.JobError as e: - raise utils.JobError(f"Applydp error: {e}") - tmp = qsv_applydp_csv - - # -------------------- QSV ANALYSIS DONE -------------------- - analysis_elapsed = time.perf_counter() - analysis_start - logger.info( - f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." 
- ) - - # ----------------------------- PII Screening ------------------------------ - # we scan for Personally Identifiable Information (PII) using qsv's powerful - # searchset command which can SIMULTANEOUSLY compare several regexes per - # field in one pass - piiscreening_start = 0 - piiscreening_elapsed = 0 - pii_found = False - - if conf.PII_SCREENING: - piiscreening_start = time.perf_counter() - pii_found = screen_for_pii(tmp, resource, qsv, temp_dir, logger) - piiscreening_elapsed = time.perf_counter() - piiscreening_start - - dataset_stats["PII_SCREENING"] = conf.PII_SCREENING - dataset_stats["PII_FOUND"] = pii_found - - # delete the qsv index file manually - # as it was created by qsv index, and not by tempfile - os.remove(qsv_index_file) - - # at this stage, the resource is ready for COPYing to the Datastore - - if dry_run: - logger.warning("Dry run only. Returning without copying to the Datastore...") - return headers_dicts - - # ============================================================ - # COPY to Datastore - # ============================================================ - copy_start = time.perf_counter() - - if conf.PREVIEW_ROWS: - logger.info(f"COPYING {rows_to_copy}-row preview to Datastore...") - else: - logger.info(f"COPYING {rows_to_copy} rows to Datastore...") - - # first, let's create an empty datastore table w/ guessed types - dsu.send_resource_to_datastore( - resource=None, - resource_id=resource["id"], - headers=headers_dicts, - records=None, - aliases=None, - calculate_record_count=False, - ) - - copied_count = 0 - try: - raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) - except psycopg2.Error as e: - raise utils.JobError(f"Could not connect to the Datastore: {e}") - else: - cur = raw_connection.cursor() - - # truncate table in case we're loading over an existing resource - try: - cur.execute( - sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) - ) - # commit to ensure that the AccessExclusive lock is only held for the - # duration of the truncate, otherwise no other access to the table is - # allowed, blocking all selects. - raw_connection.commit() - except psycopg2.Error as e: - logger.warning(f"Could not TRUNCATE: {e}") - - col_names_list = [h["id"] for h in headers_dicts] - column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) - copy_sql = sql.SQL( - "COPY {} ({}) FROM STDIN " - "WITH (FORMAT CSV, " - "HEADER 1, ENCODING 'UTF8');" - ).format( - sql.Identifier(resource_id), - column_names, - ) - # specify a 1MB buffer size for COPY read from disk - with open(tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: - try: - cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) - except psycopg2.Error as e: - raise utils.JobError(f"Postgres COPY failed: {e}") - else: - copied_count = cur.rowcount - - raw_connection.commit() - # this is needed to issue a VACUUM ANALYZE - raw_connection.set_isolation_level( - psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT - ) - analyze_cur = raw_connection.cursor() - analyze_cur.execute( - sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) - ) - analyze_cur.close() - - copy_elapsed = time.perf_counter() - copy_start - logger.info( - f'...copying done. Copied {copied_count} rows to "{resource_id}" in {copy_elapsed:,.2f} seconds.' 
- ) - - # ================================================================================================= - # INDEXING - # ================================================================================================= - # if AUTO_INDEX_THRESHOLD > 0 or AUTO_INDEX_DATES is true - # create indices automatically based on summary statistics - # For columns w/ cardinality = record_count, it's all unique values, create a unique index - # If AUTO_INDEX_DATES is true, index all date columns - # if a column's cardinality <= AUTO_INDEX_THRESHOLD, create an index for that column - if ( - conf.AUTO_INDEX_THRESHOLD - or (conf.AUTO_INDEX_DATES and datetimecols_list) - or conf.AUTO_UNIQUE_INDEX - ): - index_start = time.perf_counter() - logger.info( - f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} Auto-index dates: {conf.AUTO_INDEX_DATES} ..." - ) - index_cur = raw_connection.cursor() - - # if auto_index_threshold == -1 - # we index all the columns - if conf.AUTO_INDEX_THRESHOLD == -1: - conf.AUTO_INDEX_THRESHOLD = record_count - - index_count = 0 - for idx, cardinality in enumerate(headers_cardinality): - curr_col = headers[idx] - if ( - conf.AUTO_INDEX_THRESHOLD > 0 - or conf.AUTO_INDEX_DATES - or conf.AUTO_UNIQUE_INDEX - ): - if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: - # all the values are unique for this column, create a unique index - if conf.PREVIEW_ROWS > 0: - unique_value_count = min(conf.PREVIEW_ROWS, cardinality) - else: - unique_value_count = cardinality - logger.info( - f'Creating UNIQUE index on "{curr_col}" for {unique_value_count} unique values...' - ) - try: - index_cur.execute( - sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( - sql.Identifier(resource_id), - sql.Identifier(curr_col), - ) - ) - except psycopg2.Error as e: - logger.warning( - f'Could not CREATE UNIQUE INDEX on "{curr_col}": {e}' - ) - index_count += 1 - elif cardinality <= conf.AUTO_INDEX_THRESHOLD or ( - conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) - ): - # cardinality <= auto_index_threshold or its a date and auto_index_date is true - # create an index - if curr_col in datetimecols_list: - logger.info( - f'Creating index on "{curr_col}" date column for {cardinality} unique value/s...' - ) - else: - logger.info( - f'Creating index on "{curr_col}" for {cardinality} unique value/s...' - ) - try: - index_cur.execute( - sql.SQL("CREATE INDEX ON {} ({})").format( - sql.Identifier(resource_id), - sql.Identifier(curr_col), - ) - ) - except psycopg2.Error as e: - logger.warning(f'Could not CREATE INDEX on "{curr_col}": {e}') - index_count += 1 - - index_cur.close() - raw_connection.commit() - - logger.info("Vacuum Analyzing table to optimize indices...") - - # this is needed to issue a VACUUM ANALYZE - raw_connection.set_isolation_level( - psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT - ) - analyze_cur = raw_connection.cursor() - analyze_cur.execute( - sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) - ) - analyze_cur.close() - - index_elapsed = time.perf_counter() - index_start - logger.info( - f'...indexing/vacuum analysis done. Indexed {index_count} column/s in "{resource_id}" in {index_elapsed:,.2f} seconds.' - ) - - # ============================================================ - # PROCESS DRUF JINJA2 FORMULAE - # ============================================================ - # Check if there are any fields with DRUF keys in the scheming_yaml - # There are two types of DRUF keys: - # 1. 
"formula": This is used to update the field value DIRECTLY - # when the resource is created/updated. It can update both package and resource fields. - # 2. "suggestion_formula": This is used to populate the suggestion - # popovers DURING data entry/curation. - # DRUF keys are stored as jinja2 template expressions in the scheming_yaml - # and are rendered using the Jinja2 template engine. - formulae_start = time.perf_counter() - - # Fetch the scheming_yaml and package - package_id = resource["package_id"] - scheming_yaml, package = dsu.get_scheming_yaml( - package_id, scheming_yaml_type="dataset" - ) - - # Check for suggestion_formula in dataset_fields - has_suggestion_formula = any( - isinstance(field, dict) - and any(key.startswith("suggestion_formula") for key in field.keys()) - for field in scheming_yaml["dataset_fields"] - ) - - if has_suggestion_formula: - - logger.info( - 'Found suggestion formulae in schema' - ) - - # Check for "dpp_suggestions" in scheming_yaml - schema_has_dpp_suggestions = any( - isinstance(field, dict) - and field.get("field_name") == "dpp_suggestions" - for field in scheming_yaml["dataset_fields"] - ) - if not schema_has_dpp_suggestions: - logger.error( - '"dpp_suggestions" field required but not found in your schema. Ensure that your scheming.yaml file contains the "dpp_suggestions" field as a json_object.' - ) - return - else: - logger.info( - 'Found "dpp_suggestions" field in schema' - ) - - # add "dpp_suggestions" to package if it does not exist - if "dpp_suggestions" not in package: - - logger.warning( - 'Warning: "dpp_suggestions" field required to process Suggestion Formulae is not found in this package. Adding "dpp_suggestions" to package' - ) - - try: - package["dpp_suggestions"] = {} - dsu.patch_package(package) - logger.warning( - '"dpp_suggestions" field added to package' - ) - - except Exception as e: - logger.error( - f'Error adding "dpp_suggestions" field {e}' - ) - return - else: - logger.info( - 'No suggestion formulae found' - ) - - logger.trace(f"package: {package}") - - # FIRST, INITIALIZE THE FORMULA PROCESSOR - formula_processor = j2h.FormulaProcessor( - scheming_yaml, - package, - resource, - resource_fields_stats, - resource_fields_freqs, - dataset_stats, - logger, - ) - - package.setdefault("dpp_suggestions", {})[ - "STATUS" - ] = "STARTING FORMULAE PROCESSING..." - dsu.patch_package(package) - - # Clear all lru_cache before processing formulae - dsu.datastore_search.cache_clear() - dsu.datastore_search_sql.cache_clear() - dsu.datastore_info.cache_clear() - dsu.index_exists.cache_clear() - - # SECOND, WE PROCESS THE FORMULAE THAT UPDATE THE - # PACKAGE AND RESOURCE FIELDS DIRECTLY - # using the package_patch CKAN API so we only update the fields - # with formulae - package_updates = formula_processor.process_formulae( - "package", "dataset_fields", "formula" - ) - if package_updates: - # Update package with formula results - package.update(package_updates) - status_msg = "PACKAGE formulae processed..." 
- package["dpp_suggestions"]["STATUS"] = status_msg - try: - patched_package = dsu.patch_package(package) - logger.debug(f"Package after patching: {patched_package}") - package = patched_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error patching package: {str(e)}") - - # Process resource formulae - # as this is a direct update, we update the resource dictionary directly - resource_updates = formula_processor.process_formulae( - "resource", "resource_fields", "formula" - ) - if resource_updates: - # Update resource with formula results - resource.update(resource_updates) - status_msg = "RESOURCE formulae processed..." - if resource.get("dpp_suggestions"): - resource["dpp_suggestions"]["STATUS"] = status_msg - else: - resource["dpp_suggestions"] = {"STATUS": status_msg} - logger.info(status_msg) - - # THIRD, WE PROCESS THE SUGGESTIONS THAT SHOW UP IN THE SUGGESTION POPOVER - # we update the package dpp_suggestions field - # from which the Suggestion popover UI will pick it up - package_suggestions = formula_processor.process_formulae( - "package", "dataset_fields", "suggestion_formula" - ) - if package_suggestions: - logger.trace(f"package_suggestions: {package_suggestions}") - revise_update_content = {"package": package_suggestions} - try: - status_msg = "PACKAGE suggestion formulae processed..." - revise_update_content["STATUS"] = status_msg - revised_package = dsu.revise_package( - package_id, update={"dpp_suggestions": revise_update_content} - ) - logger.trace(f"Package after revising: {revised_package}") - package = revised_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error revising package: {str(e)}") - - # Process resource suggestion formulae - # Note how we still update the PACKAGE dpp_suggestions field - # and there is NO RESOURCE dpp_suggestions field. - # This is because suggestion formulae are used to populate the - # suggestion popover DURING data entry/curation and suggestion formulae - # may update both package and resource fields. - resource_suggestions = formula_processor.process_formulae( - "resource", "resource_fields", "suggestion_formula" - ) - if resource_suggestions: - logger.trace(f"resource_suggestions: {resource_suggestions}") - resource_name = resource["name"] - revise_update_content = {"resource": {resource_name: resource_suggestions}} - - # Handle existing suggestions - if package.get("dpp_suggestions"): - package["dpp_suggestions"].update(revise_update_content["resource"]) - else: - package["dpp_suggestions"] = revise_update_content["resource"] - - try: - status_msg = "RESOURCE suggestion formulae processed..." - revise_update_content["STATUS"] = status_msg - - revised_package = dsu.revise_package( - package_id, update={"dpp_suggestions": revise_update_content} - ) - logger.trace(f"Package after revising: {revised_package}") - package = revised_package - logger.info(status_msg) - except Exception as e: - logger.error(f"Error revising package: {str(e)}") - - # -------------------- FORMULAE PROCESSING DONE -------------------- - formulae_elapsed = time.perf_counter() - formulae_start - logger.info( - f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." 
- ) - - # ============================================================ - # UPDATE RESOURCE METADATA - # ============================================================ - metadata_start = time.perf_counter() - logger.info("UPDATING RESOURCE METADATA...") - - # --------------------- AUTO-ALIASING ------------------------ - # aliases are human-readable, and make it easier to use than resource id hash - # when using the Datastore API and in SQL queries - alias = None - if conf.AUTO_ALIAS: - logger.info(f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ...") - # get package info, so we can construct the alias - package = dsu.get_package(resource["package_id"]) - - resource_name = resource.get("name") - package_name = package.get("name") - owner_org = package.get("organization") - owner_org_name = "" - if owner_org: - owner_org_name = owner_org.get("name") - if resource_name and package_name and owner_org_name: - # we limit it to 55, so we still have space for sequence & stats suffix - # postgres max identifier length is 63 - alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] - # if AUTO_ALIAS_UNIQUE is true, check if the alias already exist, if it does - # add a sequence suffix so the new alias can be created - cur.execute( - "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", - (alias + "%",), - ) - alias_query_result = cur.fetchone() - if alias_query_result: - alias_count = alias_query_result[0] - existing_alias_of = alias_query_result[1] - else: - alias_count = 0 - existing_alias_of = "" - if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: - alias_sequence = alias_count + 1 - while True: - # we do this, so we're certain the new alias does not exist - # just in case they deleted an older alias with a lower sequence # - alias = f"{alias}-{alias_sequence:03}" - cur.execute( - "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", - (alias + "%",), - ) - alias_exists = cur.fetchone()[0] - if not alias_exists: - break - alias_sequence += 1 - elif alias_count == 1: - logger.warning( - f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' - ) - try: - cur.execute( - sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) - ) - except psycopg2.Error as e: - logger.warning(f"Could not drop alias/view: {e}") - - else: - logger.warning( - f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" - ) - alias = None - - # -------- should we ADD_SUMMARY_STATS_RESOURCE? ------------- - # by default, we only add summary stats if we're not doing a partial download - # (otherwise, you're summarizing the preview, not the whole file) - # That is, unless SUMMARY_STATS_WITH_PREVIEW is set to true - if conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW: - stats_resource_id = resource_id + "-stats" - - # check if the stats already exist - existing_stats = dsu.datastore_resource_exists(stats_resource_id) - # Delete existing summary-stats before proceeding. 
- if existing_stats: - logger.info(f'Deleting existing summary stats "{stats_resource_id}".') - - cur.execute( - "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", - (stats_resource_id + "%",), - ) - stats_alias_result = cur.fetchone() - if stats_alias_result: - existing_stats_alias_of = stats_alias_result[0] - - dsu.delete_datastore_resource(existing_stats_alias_of) - dsu.delete_resource(existing_stats_alias_of) - - stats_aliases = [stats_resource_id] - if conf.AUTO_ALIAS: - auto_alias_stats_id = alias + "-stats" - stats_aliases.append(auto_alias_stats_id) - - # check if the summary-stats alias already exist. We need to do this as summary-stats resources - # may end up having the same alias if AUTO_ALIAS_UNIQUE is False, so we need to drop the - # existing summary stats-alias. - existing_alias_stats = dsu.datastore_resource_exists(auto_alias_stats_id) - # Delete existing auto-aliased summary-stats before proceeding. - if existing_alias_stats: - logger.info( - f'Deleting existing alias summary stats "{auto_alias_stats_id}".' - ) - - cur.execute( - "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", - (auto_alias_stats_id + "%",), - ) - result = cur.fetchone() - if result: - existing_stats_alias_of = result[0] - - dsu.delete_datastore_resource(existing_stats_alias_of) - dsu.delete_resource(existing_stats_alias_of) - - # run stats on stats CSV to get header names and infer data types - # we don't need summary statistics, so use the --typesonly option - try: - qsv_stats_stats = qsv.stats( - qsv_stats_csv, - typesonly=True, - ) - except utils.JobError as e: - raise utils.JobError(f"Cannot run stats on CSV stats: {e}") - - stats_stats = str(qsv_stats_stats.stdout).strip() - stats_stats_dict = [ - dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) - for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) - ] - - logger.info(f"stats_stats_dict: {stats_stats_dict}") - - resource_name = resource.get("name") - stats_resource = { - "package_id": resource["package_id"], - "name": resource_name + " - Summary Statistics", - "format": "CSV", - "mimetype": "text/csv", - } - stats_response = dsu.send_resource_to_datastore( - stats_resource, - resource_id=None, - headers=stats_stats_dict, - records=None, - aliases=stats_aliases, - calculate_record_count=False, - ) - - logger.info(f"stats_response: {stats_response}") - - new_stats_resource_id = stats_response["result"]["resource_id"] - - # now COPY the stats to the datastore - col_names_list = [h["id"] for h in stats_stats_dict] - logger.info( - f'ADDING SUMMARY STATISTICS {col_names_list} in "{new_stats_resource_id}" with alias/es "{stats_aliases}"...' 
- ) - - column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) - - copy_sql = sql.SQL( - "COPY {} ({}) FROM STDIN " - "WITH (FORMAT CSV, " - "HEADER 1, ENCODING 'UTF8');" - ).format( - sql.Identifier(new_stats_resource_id), - column_names, - ) - - with open(qsv_stats_csv, "rb") as f: - try: - cur.copy_expert(copy_sql, f) - except psycopg2.Error as e: - raise utils.JobError(f"Postgres COPY failed: {e}") - - stats_resource["id"] = new_stats_resource_id - stats_resource["summary_statistics"] = True - stats_resource["summary_of_resource"] = resource_id - dsu.update_resource(stats_resource) - - cur.close() - raw_connection.commit() - raw_connection.close() - - resource["datastore_active"] = True - resource["total_record_count"] = record_count - if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): - resource["preview"] = True - resource["preview_rows"] = copied_count - else: - resource["preview"] = False - resource["preview_rows"] = None - resource["partial_download"] = False - dsu.update_resource(resource) - - # tell CKAN to calculate_record_count and set alias if set - dsu.send_resource_to_datastore( - resource=None, - resource_id=resource["id"], - headers=headers_dicts, - records=None, - aliases=alias, - calculate_record_count=True, - ) - - if alias: - logger.info(f'Created alias "{alias}" for "{resource_id}"...') - - metadata_elapsed = time.perf_counter() - metadata_start - logger.info( - f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in {metadata_elapsed:,.2f} seconds." - ) - - # -------------------- DONE -------------------- - package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" - dsu.patch_package(package) - - total_elapsed = time.perf_counter() - timer_start - newline_var = "\n" - end_msg = f""" - DATAPUSHER+ JOB DONE! -   Download: {fetch_elapsed:,.2f} -   Analysis: {analysis_elapsed:,.2f}{(newline_var + f" PII Screening: {piiscreening_elapsed:,.2f}") if piiscreening_elapsed > 0 else ""} -   COPYing: {copy_elapsed:,.2f} -   Indexing: {index_elapsed:,.2f} -   Formulae processing: {formulae_elapsed:,.2f} -   Resource metadata updates: {metadata_elapsed:,.2f} - TOTAL ELAPSED TIME: {total_elapsed:,.2f} - """ - logger.info(end_msg) +""" +DataPusher Plus Jobs Module - Backward Compatibility Wrapper + +This file provides backward compatibility for code importing from the original +jobs.py module. The actual implementation has been refactored into a modular +pipeline architecture located in the jobs/ subdirectory. + +For the refactored implementation, see: +- jobs/pipeline.py - Main orchestration logic +- jobs/context.py - Processing context state +- jobs/stages/ - Individual pipeline stages + +Original implementation preserved in jobs_legacy.py for reference. +""" + +# Import and re-export main entry points from the refactored pipeline +from ckanext.datapusher_plus.jobs.pipeline import ( + datapusher_plus_to_datastore, + push_to_datastore, + validate_input, + callback_datapusher_hook, +) + +# Export all public functions +__all__ = [ + "datapusher_plus_to_datastore", + "push_to_datastore", + "validate_input", + "callback_datapusher_hook", +] diff --git a/ckanext/datapusher_plus/jobs/__init__.py b/ckanext/datapusher_plus/jobs/__init__.py new file mode 100644 index 0000000..53f1ba0 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +""" +DataPusher Plus Jobs Module + +This module contains the refactored job processing pipeline for DataPusher Plus. 
+The monolithic jobs.py has been refactored into a clean pipeline architecture. +""" + +# Re-export main entry points for backward compatibility +from ckanext.datapusher_plus.jobs.pipeline import ( + datapusher_plus_to_datastore, + push_to_datastore, +) + +__all__ = [ + "datapusher_plus_to_datastore", + "push_to_datastore", +] diff --git a/ckanext/datapusher_plus/jobs/context.py b/ckanext/datapusher_plus/jobs/context.py new file mode 100644 index 0000000..d4f40ab --- /dev/null +++ b/ckanext/datapusher_plus/jobs/context.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" +ProcessingContext for the DataPusher Plus pipeline. + +This class holds all state that is passed between pipeline stages. +""" + +import logging +from typing import Dict, Any, Optional, List +from dataclasses import dataclass, field + +from ckanext.datapusher_plus.qsv_utils import QSVCommand + + +@dataclass +class ProcessingContext: + """ + Context object that holds all state for the data processing pipeline. + + This object is passed through each stage of the pipeline and is modified + by each stage to track progress and intermediate results. + """ + + # Task/Job identification + task_id: str + input: Dict[str, Any] + dry_run: bool = False + + # Directories and file paths + temp_dir: str = "" + tmp: str = "" # Current working CSV file (changes throughout pipeline) + + # Logging and utilities + logger: Optional[logging.Logger] = None + qsv: Optional[QSVCommand] = None + + # Resource information (from CKAN) + resource: Dict[str, Any] = field(default_factory=dict) + resource_id: str = "" + resource_url: str = "" + ckan_url: str = "" + + # Headers and schema + headers_dicts: List[Dict[str, Any]] = field(default_factory=list) + headers: List[str] = field(default_factory=list) + original_header_dict: Dict[int, str] = field(default_factory=dict) + + # Statistics and metadata + dataset_stats: Dict[str, Any] = field(default_factory=dict) + resource_fields_stats: Dict[str, Any] = field(default_factory=dict) + resource_fields_freqs: Dict[str, Any] = field(default_factory=dict) + + # Datastore information + existing_info: Optional[Dict[str, Any]] = None + rows_to_copy: int = 0 + copied_count: int = 0 + + # Timing information + timer_start: float = 0.0 + + # Processing flags and results + pii_found: bool = False + file_hash: str = "" + content_length: int = 0 + + # Intermediate files (for tracking) + qsv_index_file: str = "" + + @property + def metadata(self) -> Dict[str, Any]: + """Convenience property to access input metadata.""" + return self.input.get("metadata", {}) + + def update_tmp(self, new_tmp: str) -> None: + """ + Update the current working CSV file path. + + Args: + new_tmp: Path to the new temporary CSV file + """ + self.tmp = new_tmp + self.logger.log(5, f"Updated tmp file to: {new_tmp}") # TRACE level + + def add_stat(self, key: str, value: Any) -> None: + """ + Add a statistic to the dataset stats. + + Args: + key: Statistics key + value: Statistics value + """ + self.dataset_stats[key] = value diff --git a/ckanext/datapusher_plus/jobs/pipeline.py b/ckanext/datapusher_plus/jobs/pipeline.py new file mode 100644 index 0000000..41d3e52 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/pipeline.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +""" +DataPusher Plus Pipeline + +Main orchestration logic for the refactored jobs module. 
+""" + +import sys +import time +import logging +import tempfile +import traceback +import sqlalchemy as sa +from pathlib import Path +from typing import Dict, Any, Optional, List +from rq import get_current_job + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.logging_utils import TRACE +from ckanext.datapusher_plus.qsv_utils import QSVCommand +from ckanext.datapusher_plus.jobs.context import ProcessingContext +from ckanext.datapusher_plus.jobs.stages.download import DownloadStage +from ckanext.datapusher_plus.jobs.stages.format_converter import FormatConverterStage +from ckanext.datapusher_plus.jobs.stages.validation import ValidationStage +from ckanext.datapusher_plus.jobs.stages.analysis import AnalysisStage +from ckanext.datapusher_plus.jobs.stages.database import DatabaseStage +from ckanext.datapusher_plus.jobs.stages.indexing import IndexingStage +from ckanext.datapusher_plus.jobs.stages.formula import FormulaStage +from ckanext.datapusher_plus.jobs.stages.metadata import MetadataStage + + +# Re-export validation functions for backward compatibility +def validate_input(input: Dict[str, Any]) -> None: + """ + Validates input dictionary contains required metadata and resource_id. + + Args: + input: Input dictionary + + Raises: + utils.JobError: If validation fails + """ + if "metadata" not in input: + raise utils.JobError("Metadata missing") + + data = input["metadata"] + + if "resource_id" not in data: + raise utils.JobError("No id provided.") + + +def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: + """ + Sends callback to CKAN with job status updates. + + Args: + result_url: URL to send callback to + job_dict: Job status dictionary + + Returns: + True if callback successful, False otherwise + """ + import json + import requests + + api_token = utils.get_dp_plus_user_apitoken() + headers: Dict[str, str] = { + "Content-Type": "application/json", + "Authorization": api_token, + } + + try: + result = requests.post( + result_url, + data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), + verify=conf.SSL_VERIFY, + headers=headers, + ) + except requests.ConnectionError: + return False + + return result.status_code == requests.codes.ok + + +def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: + """ + Main function called by the datapusher_plus worker. + + Errors are caught and logged in the database. 
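+
+    A minimal ``input`` payload looks roughly like this (illustrative values;
+    only the keys used by this module are shown):
+
+        {
+            "metadata": {
+                "resource_id": "<resource-id>",
+                "ckan_url": "https://ckan.example.org",
+            },
+            "result_url": "<callback-url>",
+        }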
+ + Args: + input: Dictionary containing metadata and other job information + + Returns: + Optional[str]: Returns "error" if there was an error, None otherwise + """ + job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") + callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) + + job_id = get_current_job().id + errored = False + try: + push_to_datastore(input, job_id) + job_dict["status"] = "complete" + dph.mark_job_as_completed(job_id, job_dict) + except utils.JobError as e: + dph.mark_job_as_errored(job_id, str(e)) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + except Exception as e: + dph.mark_job_as_errored( + job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) + ) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + finally: + is_saved_ok = callback_datapusher_hook( + result_url=input["result_url"], job_dict=job_dict + ) + errored = errored or not is_saved_ok + return "error" if errored else None + + +def push_to_datastore( + input: Dict[str, Any], task_id: str, dry_run: bool = False +) -> Optional[List[Dict[str, Any]]]: + """ + Download and parse a resource push its data into CKAN's DataStore. + + An asynchronous job that gets a resource from CKAN, downloads the + resource's data file and, if the data file has changed since last time, + parses the data and posts it into CKAN's DataStore. + + Args: + input: Dictionary containing metadata and other job information + task_id: Unique identifier for the task + dry_run: If True, fetch and parse the data file but don't actually post the + data to the DataStore, instead return the data headers and rows that + would have been posted. + + Returns: + Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows + that would have been posted. Otherwise returns None. + """ + # Ensure temporary files are removed after run + with tempfile.TemporaryDirectory() as temp_dir: + return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) + + +def _push_to_datastore( + task_id: str, + input: Dict[str, Any], + dry_run: bool = False, + temp_dir: Optional[str] = None, +) -> Optional[List[Dict[str, Any]]]: + """ + Internal function that processes the resource through the pipeline. 
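+
+    The context is passed through the stages registered on
+    DataProcessingPipeline, in order: Download, FormatConverter, Validation,
+    Analysis, Database, Indexing, Formula, Metadata. In dry-run mode the
+    inferred headers are returned instead of being pushed, e.g.
+    (illustrative shape):
+
+        [{"id": "station_id", "type": "integer", "info": {"label": "Station ID"}}]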
+ + Args: + task_id: Unique task identifier + input: Input dictionary with metadata + dry_run: If True, don't actually push to datastore + temp_dir: Temporary directory path + + Returns: + Optional list of headers dicts if dry_run is True + """ + # Register job + try: + dph.add_pending_job(task_id, **input) + except sa.exc.IntegrityError: + raise utils.JobError("Job already exists.") + + # Setup logging + handler = utils.StoringHandler(task_id, input) + logger = logging.getLogger(task_id) + logger.addHandler(handler) + logger.addHandler(logging.StreamHandler()) + + # Set log level + try: + log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) + except AttributeError: + log_level = TRACE + + logger.setLevel(logging.INFO) + logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") + logger.setLevel(log_level) + + # Validate QSV binary exists + if not Path(conf.QSV_BIN).is_file(): + raise utils.JobError(f"{conf.QSV_BIN} not found.") + + # Initialize QSV + qsv = QSVCommand(logger=logger) + + # Validate input + validate_input(input) + + # Extract metadata + data = input["metadata"] + ckan_url = data["ckan_url"] + resource_id = data["resource_id"] + + # Fetch resource + try: + resource = dsu.get_resource(resource_id) + except utils.JobError: + # Retry once after 5 seconds + time.sleep(5) + resource = dsu.get_resource(resource_id) + + # Check if resource is datastore type + if resource.get("url_type") == "datastore": + logger.info("Dump files are managed with the Datastore API") + return + + # Create processing context + context = ProcessingContext( + task_id=task_id, + input=input, + dry_run=dry_run, + temp_dir=temp_dir, + logger=logger, + qsv=qsv, + resource=resource, + resource_id=resource_id, + ckan_url=ckan_url, + ) + + # Create and run pipeline + pipeline = DataProcessingPipeline() + result_context = pipeline.execute(context) + + # Return headers if dry run + if dry_run and result_context: + return result_context.headers_dicts + + return None + + +class DataProcessingPipeline: + """ + Orchestrates the data processing pipeline through sequential stages. + + Each stage processes the context and returns it (possibly modified). + If a stage returns None, the pipeline stops execution. + """ + + def __init__(self): + """Initialize the pipeline with all processing stages.""" + self.stages = [ + DownloadStage(), + FormatConverterStage(), + ValidationStage(), + AnalysisStage(), + DatabaseStage(), + IndexingStage(), + FormulaStage(), + MetadataStage(), + ] + + def execute(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Execute all pipeline stages sequentially. 
+
+        Args:
+            context: Initial processing context
+
+        Returns:
+            Final processing context, or None if pipeline was aborted
+
+        Raises:
+            utils.JobError: If any stage fails
+        """
+        # Keep a reference to the logger so we can still report why the
+        # pipeline stopped once a stage has returned None.
+        logger = context.logger
+        for stage in self.stages:
+            try:
+                context = stage(context)
+
+                # If stage returns None, stop pipeline
+                if context is None:
+                    logger.info(f"Pipeline stopped after stage: {stage.name}")
+                    return None
+
+            except utils.JobError:
+                # Re-raise JobErrors as-is
+                raise
+            except Exception as e:
+                # Wrap other exceptions
+                raise utils.JobError(
+                    f"Stage {stage.name} failed with error: {str(e)}"
+                ) from e
+
+        logger.info("Pipeline completed successfully!")
+        return context
diff --git a/ckanext/datapusher_plus/jobs/stages/__init__.py b/ckanext/datapusher_plus/jobs/stages/__init__.py
new file mode 100644
index 0000000..bce429a
--- /dev/null
+++ b/ckanext/datapusher_plus/jobs/stages/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+Processing stages for the DataPusher Plus pipeline.
+
+Each stage handles a specific part of the ETL process.
+"""
+
+from ckanext.datapusher_plus.jobs.stages.base import BaseStage
+
+__all__ = ["BaseStage"]
diff --git a/ckanext/datapusher_plus/jobs/stages/analysis.py b/ckanext/datapusher_plus/jobs/stages/analysis.py
new file mode 100644
index 0000000..0fbf522
--- /dev/null
+++ b/ckanext/datapusher_plus/jobs/stages/analysis.py
@@ -0,0 +1,587 @@
+# -*- coding: utf-8 -*-
+"""
+Analysis stage for the DataPusher Plus pipeline.
+
+Handles type inference, statistics, frequency tables, and PII screening.
+"""
+
+import os
+import csv
+import time
+import json
+from typing import List, Dict, Any
+
+import ckanext.datapusher_plus.utils as utils
+import ckanext.datapusher_plus.config as conf
+import ckanext.datapusher_plus.datastore_utils as dsu
+from ckanext.datapusher_plus.pii_screening import screen_for_pii
+from ckanext.datapusher_plus.jobs.stages.base import BaseStage
+from ckanext.datapusher_plus.jobs.context import ProcessingContext
+
+
+class AnalysisStage(BaseStage):
+    """
+    Analyzes CSV file to infer types and generate statistics.
+
+    Responsibilities:
+    - Extract and sanitize headers
+    - Infer data types
+    - Generate statistics
+    - Create frequency tables
+    - Generate preview if needed
+    - Normalize dates to RFC3339
+    - Screen for PII
+    """
+
+    def __init__(self):
+        super().__init__(name="Analysis")
+
+    def process(self, context: ProcessingContext) -> ProcessingContext:
+        """
+        Analyze CSV file and infer schema.
+ + Args: + context: Processing context + + Returns: + Updated context with schema information + + Raises: + utils.JobError: If analysis fails + """ + analysis_start = time.perf_counter() + + # Extract headers and sanitize + original_header_dict = self._extract_headers(context) + self._sanitize_headers(context) + + # Create index for faster operations + self._create_index(context) + + # Get record count if not already available + record_count = context.dataset_stats.get("RECORD_COUNT") + if not record_count: + record_count = self._count_records(context) + + # Check if empty + if record_count == 0: + context.logger.warning("Upload skipped as there are zero records.") + return None + + # Log record count + unique_qualifier = "unique" if conf.DEDUP else "" + context.logger.info(f"{record_count} {unique_qualifier} records detected...") + + # Infer types and generate statistics + headers_dicts, datetimecols_list, resource_fields_stats = ( + self._infer_types_and_stats(context, original_header_dict) + ) + + # Store headers in context + context.headers_dicts = headers_dicts + context.headers = [h["id"] for h in headers_dicts] + context.original_header_dict = original_header_dict + + # Generate frequency tables + resource_fields_freqs = self._generate_frequency_tables(context) + + # Update field stats with frequency data + for field, freqs in resource_fields_freqs.items(): + if field in resource_fields_stats: + resource_fields_stats[field]["freqs"] = freqs + + # Store field stats in context for FormulaStage + context.resource_fields_stats = resource_fields_stats + context.resource_fields_freqs = resource_fields_freqs + + # Generate preview if needed + context.rows_to_copy = record_count + if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: + context.rows_to_copy = self._generate_preview(context, record_count) + + # Normalize dates to RFC3339 + if datetimecols_list: + self._normalize_dates(context, datetimecols_list) + + # Analysis complete + analysis_elapsed = time.perf_counter() - analysis_start + context.logger.info( + f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." + ) + + # PII Screening + self._screen_pii(context) + + # Remove index file + if context.qsv_index_file and os.path.exists(context.qsv_index_file): + os.remove(context.qsv_index_file) + + return context + + def _extract_headers(self, context: ProcessingContext) -> Dict[int, str]: + """ + Extract original headers from CSV. + + Args: + context: Processing context + + Returns: + Dictionary mapping column index to original header name + + Raises: + utils.JobError: If headers cannot be extracted + """ + try: + qsv_headers = context.qsv.headers(context.tmp, just_names=True) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + original_headers = str(qsv_headers.stdout).strip() + original_header_dict = { + idx: ele for idx, ele in enumerate(original_headers.splitlines()) + } + return original_header_dict + + def _sanitize_headers(self, context: ProcessingContext) -> None: + """ + Sanitize headers to be database-safe. 
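+
+        For example, a header such as ``Date of Birth!`` would be rewritten by
+        ``qsv safenames`` to a database-safe identifier; names listed in
+        conf.RESERVED_COLNAMES and otherwise-unsafe names are adjusted, with
+        conf.UNSAFE_PREFIX available as a prefix. (The exact replacement rules
+        are qsv's, not this module's.)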
+ + Args: + context: Processing context + + Raises: + utils.JobError: If header sanitization fails + """ + context.logger.info('Checking for "database-safe" header names...') + + try: + qsv_safenames = context.qsv.safenames( + context.tmp, + mode="json", + reserved=conf.RESERVED_COLNAMES, + prefix=conf.UNSAFE_PREFIX, + uses_stdio=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + unsafe_json = json.loads(str(qsv_safenames.stdout)) + unsafe_headers = unsafe_json["unsafe_headers"] + + if unsafe_headers: + context.logger.info( + f'"{len(unsafe_headers)} unsafe" header names found ' + f"({unsafe_headers}). Sanitizing...\"" + ) + qsv_safenames_csv = os.path.join(context.temp_dir, "qsv_safenames.csv") + context.qsv.safenames( + context.tmp, mode="conditional", output_file=qsv_safenames_csv + ) + context.update_tmp(qsv_safenames_csv) + else: + context.logger.info("No unsafe header names found...") + + def _create_index(self, context: ProcessingContext) -> None: + """ + Create QSV index for faster operations. + + Args: + context: Processing context + + Raises: + utils.JobError: If index creation fails + """ + try: + context.qsv_index_file = context.tmp + ".idx" + context.qsv.index(context.tmp) + except utils.JobError as e: + raise utils.JobError(f"Cannot index CSV: {e}") + + def _count_records(self, context: ProcessingContext) -> int: + """ + Count records in CSV. + + Args: + context: Processing context + + Returns: + Number of records + + Raises: + utils.JobError: If counting fails + """ + try: + qsv_count = context.qsv.count(context.tmp) + record_count = int(str(qsv_count.stdout).strip()) + context.add_stat("RECORD_COUNT", record_count) + return record_count + except utils.JobError as e: + raise utils.JobError(f"Cannot count records in CSV: {e}") + + def _infer_types_and_stats( + self, context: ProcessingContext, original_header_dict: Dict[int, str] + ) -> tuple[List[Dict[str, Any]], List[str], Dict[str, Any]]: + """ + Infer data types and compile statistics. 
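+
+        Illustrative shape of the returned values (field names and numbers
+        are made up; the real stats rows carry more keys):
+
+            headers_dicts = [
+                {"id": "station_id", "type": "integer", "info": {"label": "Station ID"}},
+                {"id": "observed_at", "type": "timestamp", "info": {"label": "Observed At"}},
+            ]
+            datetimecols_list = ["observed_at"]
+            resource_fields_stats = {"station_id": {"stats": {"type": "Integer", "min": "1", "max": "42"}}}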
+ + Args: + context: Processing context + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list, resource_fields_stats) + + Raises: + utils.JobError: If type inference fails + """ + context.logger.info("Inferring data types and compiling statistics...") + + qsv_stats_csv = os.path.join(context.temp_dir, "qsv_stats.csv") + + # Determine if we need special handling for spatial formats + spatial_format_flag = context.resource.get("format", "").upper() in [ + "SHP", + "QGIS", + "GEOJSON", + ] + + # Run qsv stats + try: + if spatial_format_flag: + env = os.environ.copy() + env["QSV_STATS_STRING_MAX_LENGTH"] = str( + conf.QSV_STATS_STRING_MAX_LENGTH + ) + context.qsv.stats( + context.tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + env=env, + ) + else: + context.qsv.stats( + context.tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") + + # Parse stats + return self._parse_stats( + context, qsv_stats_csv, original_header_dict + ) + + def _parse_stats( + self, + context: ProcessingContext, + stats_csv: str, + original_header_dict: Dict[int, str], + ) -> tuple[List[Dict[str, Any]], List[str], Dict[str, Any]]: + """ + Parse statistics CSV and build headers dictionary. + + Args: + context: Processing context + stats_csv: Path to stats CSV + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list, resource_fields_stats) + """ + headers = [] + types = [] + headers_min = [] + headers_max = [] + headers_cardinality = [] + resource_fields_stats = {} + + with open(stats_csv, mode="r") as inp: + reader = csv.DictReader(inp) + for row in reader: + # Add to stats dictionary + resource_fields_stats[row["field"]] = {"stats": row} + + fr = {k: v for k, v in row.items()} + schema_field = fr.get("field", "Unnamed Column") + if schema_field.startswith("qsv_"): + break + + headers.append(schema_field) + types.append(fr.get("type", "String")) + headers_min.append(fr["min"]) + headers_max.append(fr["max"]) + if conf.AUTO_INDEX_THRESHOLD: + headers_cardinality.append(int(fr.get("cardinality") or 0)) + + # Store cardinality for indexing stage + if conf.AUTO_INDEX_THRESHOLD: + context.add_stat("HEADERS_CARDINALITY", headers_cardinality) + + # Check for existing datastore resource + existing = dsu.datastore_resource_exists(context.resource_id) + context.existing_info = None + if existing: + context.existing_info = dict( + (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f + ) + + # Override with types from Data Dictionary + if context.existing_info: + types = [ + { + "text": "String", + "numeric": "Float", + "timestamp": "DateTime", + }.get(context.existing_info.get(h, {}).get("type_override"), t) + for t, h in zip(types, headers) + ] + + # Delete existing datastore resource + if existing: + context.logger.info( + f'Deleting existing resource "{context.resource_id}" from datastore.' 
+ ) + dsu.delete_datastore_resource(context.resource_id) + + # Build headers_dicts + headers_dicts, datetimecols_list = self._build_headers_dicts( + context, headers, types, headers_min, headers_max, original_header_dict + ) + + context.logger.info(f"Determined headers and types: {headers_dicts}...") + + return headers_dicts, datetimecols_list, resource_fields_stats + + def _build_headers_dicts( + self, + context: ProcessingContext, + headers: List[str], + types: List[str], + headers_min: List[str], + headers_max: List[str], + original_header_dict: Dict[int, str], + ) -> tuple[List[Dict[str, Any]], List[str]]: + """ + Build headers dictionaries with proper types. + + Args: + context: Processing context + headers: List of header names + types: List of inferred types + headers_min: List of minimum values + headers_max: List of maximum values + original_header_dict: Mapping of column index to original header + + Returns: + Tuple of (headers_dicts, datetimecols_list) + """ + default_type = "String" + temp_headers_dicts = [ + dict( + id=field[0], + type=conf.TYPE_MAPPING.get( + str(field[1]) if field[1] else default_type, "text" + ), + ) + for field in zip(headers, types) + ] + + # Build final headers_dicts with smartint resolution + datetimecols_list = [] + headers_dicts = [] + + for idx, header in enumerate(temp_headers_dicts): + if header["type"] == "smartint": + # Select best integer type based on min/max + if ( + int(headers_max[idx]) <= conf.POSTGRES_INT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN + ): + header_type = "integer" + elif ( + int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN + ): + header_type = "bigint" + else: + header_type = "numeric" + else: + header_type = header["type"] + + if header_type == "timestamp": + datetimecols_list.append(header["id"]) + + info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) + headers_dicts.append( + dict(id=header["id"], type=header_type, info=info_dict) + ) + + # Preserve data dictionary from existing resource + if context.existing_info: + for h in headers_dicts: + if h["id"] in context.existing_info: + h["info"] = context.existing_info[h["id"]] + # Apply type overrides + type_override = context.existing_info[h["id"]].get("type_override") + if type_override in list(conf.TYPE_MAPPING.values()): + h["type"] = type_override + + return headers_dicts, datetimecols_list + + def _generate_frequency_tables( + self, context: ProcessingContext + ) -> Dict[str, List[Dict[str, str]]]: + """ + Generate frequency tables for each column. 
+ + Args: + context: Processing context + + Returns: + Dictionary mapping field names to frequency data + + Raises: + utils.JobError: If frequency table generation fails + """ + qsv_freq_csv = os.path.join(context.temp_dir, "qsv_freq.csv") + + try: + context.qsv.frequency( + context.tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a frequency table: {e}") + + resource_fields_freqs = {} + try: + with open(qsv_freq_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + field = row["field"] + if field not in resource_fields_freqs: + resource_fields_freqs[field] = [] + + resource_fields_freqs[field].append( + { + "value": row["value"], + "count": row["count"], + "percentage": row["percentage"], + } + ) + context.logger.log(5, f"Resource fields freqs: {resource_fields_freqs}") + except IOError as e: + raise utils.JobError(f"Could not open frequency CSV file: {e}") + + return resource_fields_freqs + + def _generate_preview(self, context: ProcessingContext, record_count: int) -> int: + """ + Generate a preview slice of the data. + + Args: + context: Processing context + record_count: Total number of records + + Returns: + Number of rows in preview + + Raises: + utils.JobError: If preview generation fails + """ + qsv_slice_csv = os.path.join(context.temp_dir, "qsv_slice.csv") + + if conf.PREVIEW_ROWS > 0: + # Positive: slice from beginning + context.logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") + try: + context.qsv.slice( + context.tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice: {e}") + rows_to_copy = conf.PREVIEW_ROWS + else: + # Negative: slice from end + slice_len = abs(conf.PREVIEW_ROWS) + context.logger.info(f"Preparing {slice_len}-row preview from the end...") + try: + context.qsv.slice( + context.tmp, start=-1, length=slice_len, output_file=qsv_slice_csv + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice from the end: {e}") + rows_to_copy = slice_len + + context.update_tmp(qsv_slice_csv) + context.add_stat("PREVIEW_FILE_SIZE", os.path.getsize(qsv_slice_csv)) + context.add_stat("PREVIEW_RECORD_COUNT", rows_to_copy) + + return rows_to_copy + + def _normalize_dates( + self, context: ProcessingContext, datetimecols_list: List[str] + ) -> None: + """ + Normalize date columns to RFC3339 format. + + Args: + context: Processing context + datetimecols_list: List of datetime column names + + Raises: + utils.JobError: If date normalization fails + """ + qsv_applydp_csv = os.path.join(context.temp_dir, "qsv_applydp.csv") + datecols = ",".join(datetimecols_list) + + context.logger.info( + f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format ' + f"with PREFER_DMY: {conf.PREFER_DMY}..." + ) + + try: + context.qsv.datefmt( + datecols, + context.tmp, + prefer_dmy=conf.PREFER_DMY, + output_file=qsv_applydp_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Applydp error: {e}") + + context.update_tmp(qsv_applydp_csv) + + def _screen_pii(self, context: ProcessingContext) -> None: + """ + Screen for Personally Identifiable Information. 
+ + Args: + context: Processing context + """ + if conf.PII_SCREENING: + piiscreening_start = time.perf_counter() + context.pii_found = screen_for_pii( + context.tmp, + context.resource, + context.qsv, + context.temp_dir, + context.logger, + ) + piiscreening_elapsed = time.perf_counter() - piiscreening_start + context.logger.info( + f"PII screening completed in {piiscreening_elapsed:,.2f} seconds" + ) + + context.add_stat("PII_SCREENING", conf.PII_SCREENING) + context.add_stat("PII_FOUND", context.pii_found) diff --git a/ckanext/datapusher_plus/jobs/stages/base.py b/ckanext/datapusher_plus/jobs/stages/base.py new file mode 100644 index 0000000..bed8efa --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/base.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +""" +Base stage class for the DataPusher Plus pipeline. + +All pipeline stages inherit from this base class. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class BaseStage(ABC): + """ + Abstract base class for all pipeline stages. + + Each stage processes the context and returns it (possibly modified). + Stages can skip processing by returning None. + """ + + def __init__(self, name: Optional[str] = None): + """ + Initialize the stage. + + Args: + name: Optional name for the stage (defaults to class name) + """ + self.name = name or self.__class__.__name__ + + @abstractmethod + def process(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Process the context through this stage. + + Args: + context: The processing context containing all state + + Returns: + The modified context, or None to skip this stage + + Raises: + utils.JobError: If processing fails + """ + pass + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Determine if this stage should be skipped. + + Override this method to add conditional stage execution. + + Args: + context: The processing context + + Returns: + True if the stage should be skipped, False otherwise + """ + return False + + def __call__(self, context: ProcessingContext) -> Optional[ProcessingContext]: + """ + Make the stage callable. + + This allows stages to be used as: stage(context) + + Args: + context: The processing context + + Returns: + The modified context, or None to skip + """ + if self.should_skip(context): + context.logger.info(f"Skipping stage: {self.name}") + return context + + context.logger.info(f"Starting stage: {self.name}") + result = self.process(context) + context.logger.info(f"Completed stage: {self.name}") + return result + + def __repr__(self) -> str: + """String representation of the stage.""" + return f"<{self.name}>" diff --git a/ckanext/datapusher_plus/jobs/stages/database.py b/ckanext/datapusher_plus/jobs/stages/database.py new file mode 100644 index 0000000..e996be8 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/database.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +""" +Database stage for the DataPusher Plus pipeline. + +Handles copying data to the PostgreSQL datastore. +""" + +import time +import psycopg2 +from psycopg2 import sql + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class DatabaseStage(BaseStage): + """ + Copies data to PostgreSQL datastore. 
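+
+    The data is loaded with a server-side COPY roughly equivalent to the
+    following (illustrative table and column names):
+
+        COPY "<resource-id>" ("station_id", "observed_at")
+        FROM STDIN WITH (FORMAT CSV, FREEZE 1, HEADER 1, ENCODING 'UTF8');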
+ + Responsibilities: + - Create empty datastore table with schema + - Use PostgreSQL COPY to efficiently load data + - Run VACUUM ANALYZE for performance + """ + + def __init__(self): + super().__init__(name="Database") + + def should_skip(self, context: ProcessingContext) -> bool: + """Skip if in dry run mode.""" + return context.dry_run + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Copy data to datastore. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If database operations fail + """ + if context.dry_run: + context.logger.warning( + "Dry run only. Returning without copying to the Datastore..." + ) + return context + + copy_start = time.perf_counter() + + if conf.PREVIEW_ROWS: + context.logger.info( + f"COPYING {context.rows_to_copy}-row preview to Datastore..." + ) + else: + context.logger.info( + f"COPYING {context.rows_to_copy} rows to Datastore..." + ) + + # Create empty datastore table + self._create_datastore_table(context) + + # Copy data using PostgreSQL COPY + copied_count = self._copy_data(context) + + context.copied_count = copied_count + + copy_elapsed = time.perf_counter() - copy_start + context.logger.info( + f'...copying done. Copied {copied_count} rows to "{context.resource_id}" ' + f"in {copy_elapsed:,.2f} seconds." + ) + + return context + + def _create_datastore_table(self, context: ProcessingContext) -> None: + """ + Create empty datastore table with schema. + + Args: + context: Processing context + """ + dsu.send_resource_to_datastore( + resource=None, + resource_id=context.resource["id"], + headers=context.headers_dicts, + records=None, + aliases=None, + calculate_record_count=False, + ) + + def _copy_data(self, context: ProcessingContext) -> int: + """ + Copy data to datastore using PostgreSQL COPY. + + Args: + context: Processing context + + Returns: + Number of rows copied + + Raises: + utils.JobError: If COPY operation fails + """ + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + cur = raw_connection.cursor() + + # Truncate table for COPY FREEZE optimization + self._truncate_table(cur, context.resource_id) + + # Prepare COPY SQL + col_names_list = [h["id"] for h in context.headers_dicts] + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, FREEZE 1, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(context.resource_id), + column_names, + ) + + # Execute COPY + with open(context.tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: + try: + cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + copied_count = cur.rowcount + + raw_connection.commit() + + # VACUUM ANALYZE for performance + self._vacuum_analyze(raw_connection, context.resource_id) + + return copied_count + + finally: + if raw_connection: + raw_connection.close() + + def _truncate_table(self, cursor: psycopg2.extensions.cursor, resource_id: str) -> None: + """ + Truncate table to enable COPY FREEZE optimization. 
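+
+        PostgreSQL only honours COPY ... FREEZE when the target table was
+        created or truncated within the current (sub)transaction, which is
+        why the table is truncated here immediately before the COPY.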
+ + Args: + cursor: Database cursor + resource_id: Resource ID (table name) + """ + try: + cursor.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) + ) + except psycopg2.Error as e: + # Non-fatal, log warning but continue + # (table might not exist yet) + pass + + def _vacuum_analyze( + self, connection: psycopg2.extensions.connection, resource_id: str + ) -> None: + """ + Run VACUUM ANALYZE on the table. + + Args: + connection: Database connection + resource_id: Resource ID (table name) + """ + # Set isolation level for VACUUM + connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + + analyze_cur = connection.cursor() + try: + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + finally: + analyze_cur.close() diff --git a/ckanext/datapusher_plus/jobs/stages/download.py b/ckanext/datapusher_plus/jobs/stages/download.py new file mode 100644 index 0000000..14edd42 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/download.py @@ -0,0 +1,379 @@ +# -*- coding: utf-8 -*- +""" +Download stage for the DataPusher Plus pipeline. + +Handles downloading resources, hash checking, and ZIP file extraction. +""" + +import os +import time +import hashlib +import mimetypes +from typing import Dict, Any +from urllib.parse import urlsplit, urlparse + +import requests +from datasize import DataSize +from dateutil.parser import parse as parsedate + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.job_exceptions import HTTPError +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class DownloadStage(BaseStage): + """ + Downloads the resource file, validates it, and handles ZIP extraction. + + Responsibilities: + - Validate resource URL scheme + - Download file with authentication if needed + - Calculate file hash for deduplication + - Check if file has changed since last upload + - Extract ZIP files if applicable + """ + + def __init__(self): + super().__init__(name="Download") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Download and validate the resource file. + + Args: + context: Processing context + + Returns: + Updated context with downloaded file information + + Raises: + utils.JobError: If download fails or file is invalid + """ + # Validate resource URL scheme + self._validate_url_scheme(context) + + # Start timing + context.timer_start = time.perf_counter() + + # Download the file + file_hash, length, resource_format, response_headers = self._download_file(context) + + # Store file information + context.file_hash = file_hash + context.content_length = length + context.add_stat("ORIGINAL_FILE_SIZE", length) + + # Check for file deduplication + if self._should_skip_upload(context, file_hash, response_headers): + context.logger.warning( + f"Upload skipped as the file hash hasn't changed: {file_hash}." + ) + return None # Signal to skip further processing + + # Update resource hash + context.resource["hash"] = file_hash + + # Log download completion + fetch_elapsed = time.perf_counter() - context.timer_start + context.logger.info( + f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." 
+ ) + + # Handle ZIP file extraction + self._handle_zip_file(context, resource_format) + + return context + + def _validate_url_scheme(self, context: ProcessingContext) -> None: + """ + Validate that the resource URL uses an allowed scheme. + + Args: + context: Processing context + + Raises: + utils.JobError: If URL scheme is not allowed + """ + context.resource_url = context.resource.get("url") + scheme = urlsplit(context.resource_url).scheme + if scheme not in ("http", "https", "ftp"): + raise utils.JobError("Only http, https, and ftp resources may be fetched.") + + def _download_file( + self, context: ProcessingContext + ) -> tuple[str, int, str, Dict[str, Any]]: + """ + Download the resource file and calculate its hash. + + Args: + context: Processing context + + Returns: + Tuple of (file_hash, file_length, resource_format, response_headers) + + Raises: + HTTPError: If download fails + utils.JobError: If file is too large or format cannot be determined + """ + resource_url = context.resource_url + context.logger.info(f"Fetching from: {resource_url}...") + + # Prepare request headers + headers: Dict[str, str] = {} + if context.resource.get("url_type") == "upload": + # Authenticate for uploaded files + api_token = utils.get_dp_plus_user_apitoken() + headers["Authorization"] = api_token + + # Rewrite URL if needed (for firewalls) + resource_url = self._rewrite_url_if_needed( + context, resource_url, context.ckan_url + ) + + # Configure request + kwargs: Dict[str, Any] = { + "headers": headers, + "timeout": conf.TIMEOUT, + "verify": conf.SSL_VERIFY, + "stream": True, + } + if conf.USE_PROXY: + kwargs["proxies"] = { + "http": conf.DOWNLOAD_PROXY, + "https": conf.DOWNLOAD_PROXY, + } + + # Download file + try: + with requests.get(resource_url, **kwargs) as response: + response.raise_for_status() + + # Get content info + cl = response.headers.get("content-length") + max_content_length = conf.MAX_CONTENT_LENGTH + ct = response.headers.get("content-type") + + # Check size before download + if cl: + try: + if int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: + raise utils.JobError( + f"Resource too large to download: {DataSize(int(cl)):.2MB} " + f"> max ({DataSize(int(max_content_length)):.2MB})." + ) + except ValueError: + pass + + # Determine file format + resource_format = self._determine_format( + context, ct, response.headers + ) + + # Download and hash the file + file_hash, length = self._stream_download( + context, resource_format, response, max_content_length + ) + + return file_hash, length, resource_format, dict(response.headers) + + except requests.HTTPError as e: + raise HTTPError( + f"DataPusher+ received a bad HTTP response when trying to download " + f"the data file from {resource_url}. Status code: {e.response.status_code}, " + f"Response content: {e.response.content}", + status_code=e.response.status_code, + request_url=resource_url, + response=e.response.content, + ) + except requests.RequestException as e: + raise HTTPError( + message=str(e), + status_code=None, + request_url=resource_url, + response=None, + ) + + def _rewrite_url_if_needed( + self, context: ProcessingContext, resource_url: str, ckan_url: str + ) -> str: + """ + Rewrite URL if CKAN is behind a firewall. 
+ + Args: + context: Processing context + resource_url: Original resource URL + ckan_url: CKAN base URL + + Returns: + Potentially rewritten URL + """ + if not resource_url.startswith(ckan_url): + new_url = urlparse(resource_url) + rewrite_url = urlparse(ckan_url) + new_url = new_url._replace( + scheme=rewrite_url.scheme, netloc=rewrite_url.netloc + ) + resource_url = new_url.geturl() + context.logger.info(f"Rewritten resource url to: {resource_url}") + return resource_url + + def _determine_format( + self, context: ProcessingContext, content_type: str, headers: Dict[str, Any] + ) -> str: + """ + Determine the file format from resource metadata or content type. + + Args: + context: Processing context + content_type: HTTP content-type header + headers: Response headers + + Returns: + File format string (uppercase) + + Raises: + utils.JobError: If format cannot be determined + """ + resource_format = context.resource.get("format", "").upper() + + if not resource_format: + context.logger.info("File format: NOT SPECIFIED") + if content_type: + extension = mimetypes.guess_extension(content_type.split(";")[0]) + if extension is None: + raise utils.JobError( + "Cannot determine format from mime type. Please specify format." + ) + resource_format = extension.lstrip(".").upper() + context.logger.info(f"Inferred file format: {resource_format}") + else: + raise utils.JobError( + "Server did not return content-type. Please specify format." + ) + else: + context.logger.info(f"File format: {resource_format}") + + return resource_format + + def _stream_download( + self, + context: ProcessingContext, + resource_format: str, + response: requests.Response, + max_content_length: int, + ) -> tuple[str, int]: + """ + Stream download the file and calculate its hash. + + Args: + context: Processing context + resource_format: File format extension + response: HTTP response object + max_content_length: Maximum allowed file size + + Returns: + Tuple of (file_hash, file_length) + + Raises: + utils.JobError: If file exceeds maximum size + """ + tmp = os.path.join(context.temp_dir, "tmp." + resource_format) + context.update_tmp(tmp) + + length = 0 + # Using MD5 for file deduplication only (not for security) + m = hashlib.md5() # DevSkim: ignore DS126858 + + # Log download start + cl = response.headers.get("content-length") + if cl: + context.logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") + else: + context.logger.info("Downloading file of unknown size...") + + # Stream download + with open(tmp, "wb") as tmp_file: + for chunk in response.iter_content(conf.CHUNK_SIZE): + length += len(chunk) + if length > max_content_length and not conf.PREVIEW_ROWS: + raise utils.JobError( + f"Resource too large to process: {length} > max ({max_content_length})." + ) + tmp_file.write(chunk) + m.update(chunk) + + return m.hexdigest(), length + + def _should_skip_upload( + self, + context: ProcessingContext, + file_hash: str, + response_headers: Dict[str, Any], + ) -> bool: + """ + Check if upload should be skipped due to unchanged file. 
+ + Args: + context: Processing context + file_hash: MD5 hash of downloaded file + response_headers: HTTP response headers + + Returns: + True if upload should be skipped, False otherwise + """ + # Check if resource metadata was updated + resource_updated = False + resource_last_modified = context.resource.get("last_modified") + if resource_last_modified: + resource_last_modified = parsedate(resource_last_modified) + file_last_modified = response_headers.get("last-modified") + if file_last_modified: + file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) + if file_last_modified < resource_last_modified: + resource_updated = True + + # Skip if hash matches and not forced + metadata = context.metadata + return ( + context.resource.get("hash") == file_hash + and not metadata.get("ignore_hash") + and not conf.IGNORE_FILE_HASH + and not resource_updated + ) + + def _handle_zip_file(self, context: ProcessingContext, resource_format: str) -> None: + """ + Extract ZIP file if applicable. + + Args: + context: Processing context + resource_format: File format + + Returns: + None, but updates context.tmp if ZIP is extracted + """ + if resource_format.upper() == "ZIP": + context.logger.info("Processing ZIP file...") + + file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( + context.tmp, context.temp_dir, context.logger + ) + + if not file_count: + context.logger.error("ZIP file invalid or no files found in ZIP file.") + return None + + if file_count > 1: + context.logger.info( + f"More than one file in the ZIP file ({file_count} files), " + f"saving metadata..." + ) + else: + context.logger.info( + f"Extracted {unzipped_format} file: {extracted_path}" + ) + + context.update_tmp(extracted_path) diff --git a/ckanext/datapusher_plus/jobs/stages/format_converter.py b/ckanext/datapusher_plus/jobs/stages/format_converter.py new file mode 100644 index 0000000..269144c --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/format_converter.py @@ -0,0 +1,382 @@ +# -*- coding: utf-8 -*- +""" +Format Converter stage for the DataPusher Plus pipeline. + +Handles conversion of various file formats to CSV. +""" + +import os +import uuid +import subprocess +from typing import Optional + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.spatial_helpers as sh +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class FormatConverterStage(BaseStage): + """ + Converts various file formats to CSV. + + Responsibilities: + - Convert spreadsheets (XLS, XLSX, ODS, etc.) to CSV + - Convert spatial formats (SHP, GEOJSON) to CSV + - Normalize CSV/TSV/TAB files + - Transcode to UTF-8 + """ + + # Supported format types + SPREADSHEET_EXTENSIONS = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] + SPATIAL_FORMATS = ["SHP", "QGIS", "GEOJSON"] + + def __init__(self): + super().__init__(name="FormatConverter") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Convert file format to CSV. 
+ + Args: + context: Processing context + + Returns: + Updated context with CSV file + + Raises: + utils.JobError: If conversion fails + """ + resource_format = context.resource.get("format", "").upper() + + # Check if file is a spreadsheet + if resource_format in self.SPREADSHEET_EXTENSIONS: + self._convert_spreadsheet(context, resource_format) + # Check if file is a spatial format + elif resource_format in self.SPATIAL_FORMATS: + self._convert_spatial_format(context, resource_format) + # Otherwise normalize as CSV/TSV/TAB + else: + self._normalize_csv(context, resource_format) + + return context + + def _convert_spreadsheet( + self, context: ProcessingContext, file_format: str + ) -> None: + """ + Convert spreadsheet to CSV using qsv excel. + + Args: + context: Processing context + file_format: Spreadsheet format (XLS, XLSX, etc.) + + Raises: + utils.JobError: If conversion fails + """ + default_excel_sheet = conf.DEFAULT_EXCEL_SHEET + context.logger.info( + f"Converting {file_format} sheet {default_excel_sheet} to CSV..." + ) + + # Create hardlink with proper extension + qsv_spreadsheet = os.path.join( + context.temp_dir, "qsv_spreadsheet." + file_format + ) + os.link(context.tmp, qsv_spreadsheet) + + # Run qsv excel to export to CSV + qsv_excel_csv = os.path.join(context.temp_dir, "qsv_excel.csv") + try: + qsv_excel = context.qsv.excel( + qsv_spreadsheet, + sheet=default_excel_sheet, + trim=True, + output_file=qsv_excel_csv, + ) + except utils.JobError as e: + raise utils.JobError( + f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" + ) + + excel_export_msg = qsv_excel.stderr + context.logger.info(f"{excel_export_msg}...") + context.update_tmp(qsv_excel_csv) + + def _convert_spatial_format( + self, context: ProcessingContext, resource_format: str + ) -> None: + """ + Convert spatial format to CSV. + + Args: + context: Processing context + resource_format: Spatial format (SHP, GEOJSON, etc.) + + Raises: + utils.JobError: If conversion fails + """ + context.logger.info("SHAPEFILE or GEOJSON file detected...") + + # Create unique spatial file + qsv_spatial_file = os.path.join( + context.temp_dir, + f"qsv_spatial_{uuid.uuid4()}.{resource_format}", + ) + os.link(context.tmp, qsv_spatial_file) + qsv_spatial_csv = os.path.join(context.temp_dir, "qsv_spatial.csv") + + simplification_failed = False + + # Try spatial simplification if enabled + if conf.AUTO_SPATIAL_SIMPLIFICATION: + simplification_failed = not self._try_spatial_simplification( + context, qsv_spatial_file, qsv_spatial_csv, resource_format + ) + + # Fallback to qsv geoconvert if simplification failed or disabled + if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed: + self._geoconvert(context, qsv_spatial_file, resource_format) + + def _try_spatial_simplification( + self, + context: ProcessingContext, + spatial_file: str, + output_csv: str, + resource_format: str, + ) -> bool: + """ + Try to convert and simplify spatial file. + + Args: + context: Processing context + spatial_file: Path to spatial file + output_csv: Output CSV path + resource_format: Spatial format + + Returns: + True if successful, False otherwise + """ + context.logger.info( + f"Converting spatial file to CSV with a simplification relative " + f"tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." 
+ ) + + try: + success, error_message, bounds = sh.process_spatial_file( + spatial_file, + resource_format, + output_csv_path=output_csv, + tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, + task_logger=context.logger, + ) + + if success: + context.logger.info( + "Spatial file successfully simplified and converted to CSV" + ) + context.update_tmp(output_csv) + self._upload_simplified_resource(context, spatial_file, bounds) + return True + else: + context.logger.warning( + f"Upload of simplified spatial file failed: {error_message}" + ) + return False + + except Exception as e: + context.logger.warning(f"Simplification and conversion failed: {str(e)}") + context.logger.warning( + f"Simplification failed. Using qsv geoconvert to convert to CSV, " + f"truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." + ) + return False + + def _upload_simplified_resource( + self, context: ProcessingContext, spatial_file: str, bounds: Optional[tuple] + ) -> None: + """ + Upload simplified spatial resource to CKAN. + + Args: + context: Processing context + spatial_file: Path to simplified spatial file + bounds: Bounding box coordinates (minx, miny, maxx, maxy) + """ + resource = context.resource + simplified_resource_name = ( + os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1] + ) + + existing_resource, existing_resource_id = dsu.resource_exists( + resource["package_id"], simplified_resource_name + ) + + if existing_resource: + context.logger.info("Simplified resource already exists. Replacing it...") + dsu.delete_resource(existing_resource_id) + else: + context.logger.info("Simplified resource does not exist. Uploading it...") + + new_simplified_resource = { + "package_id": resource["package_id"], + "name": simplified_resource_name, + "url": "", + "format": resource["format"], + "hash": "", + "mimetype": resource["mimetype"], + "mimetype_inner": resource["mimetype_inner"], + } + + # Add bounds information if available + if bounds: + minx, miny, maxx, maxy = bounds + new_simplified_resource.update( + { + "dpp_spatial_extent": { + "type": "BoundingBox", + "coordinates": [[minx, miny], [maxx, maxy]], + } + } + ) + context.logger.info( + f"Added dpp_spatial_extent to resource metadata: {bounds}" + ) + + dsu.upload_resource(new_simplified_resource, spatial_file) + os.remove(spatial_file) + + def _geoconvert( + self, context: ProcessingContext, spatial_file: str, resource_format: str + ) -> None: + """ + Convert spatial file using qsv geoconvert. + + Args: + context: Processing context + spatial_file: Path to spatial file + resource_format: Spatial format + + Raises: + utils.JobError: If geoconvert fails + """ + context.logger.info("Converting spatial file to CSV using qsv geoconvert...") + + qsv_geoconvert_csv = os.path.join(context.temp_dir, "qsv_geoconvert.csv") + try: + context.qsv.geoconvert( + context.tmp, + resource_format, + "csv", + max_length=conf.QSV_STATS_STRING_MAX_LENGTH, + output_file=qsv_geoconvert_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"qsv geoconvert failed: {e}") + + context.update_tmp(qsv_geoconvert_csv) + context.logger.info("Geoconverted successfully") + + def _normalize_csv(self, context: ProcessingContext, resource_format: str) -> None: + """ + Normalize CSV/TSV/TAB and transcode to UTF-8. 
+ + Args: + context: Processing context + resource_format: File format + + Raises: + utils.JobError: If normalization fails + """ + # Log appropriate message + if resource_format == "CSV": + context.logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") + else: + context.logger.info( + f"Normalizing/UTF-8 transcoding {resource_format} to CSV..." + ) + + qsv_input_csv = os.path.join(context.temp_dir, "qsv_input.csv") + qsv_input_utf_8_encoded_csv = os.path.join( + context.temp_dir, "qsv_input_utf_8_encoded.csv" + ) + + # Detect file encoding + encoding = self._detect_encoding(context) + + # Re-encode to UTF-8 if needed + if encoding not in ("UTF-8", "ASCII"): + context.logger.info(f"File is not UTF-8 encoded. Re-encoding from {encoding} to UTF-8") + self._reencode_to_utf8(context, encoding, qsv_input_utf_8_encoded_csv) + source_file = qsv_input_utf_8_encoded_csv + else: + source_file = context.tmp + + # Normalize using qsv input + try: + context.qsv.input(source_file, trim_headers=True, output_file=qsv_input_csv) + except utils.JobError as e: + raise utils.JobError( + f"Job aborted as the file cannot be normalized/transcoded: {e}." + ) + + context.update_tmp(qsv_input_csv) + context.logger.info("Normalized & transcoded...") + + def _detect_encoding(self, context: ProcessingContext) -> str: + """ + Detect file encoding using uchardet. + + Args: + context: Processing context + + Returns: + Detected encoding string + + Raises: + utils.JobError: If encoding detection fails + """ + try: + file_encoding = subprocess.run( + ["uchardet", context.tmp], + check=True, + capture_output=True, + text=True, + ) + encoding = file_encoding.stdout.strip() + context.logger.info(f"Identified encoding of the file: {encoding}") + return encoding + except subprocess.CalledProcessError as e: + raise utils.JobError(f"Failed to detect file encoding: {e}") + + def _reencode_to_utf8( + self, context: ProcessingContext, from_encoding: str, output_file: str + ) -> None: + """ + Re-encode file to UTF-8 using iconv. + + Args: + context: Processing context + from_encoding: Source encoding + output_file: Output file path + + Raises: + utils.JobError: If re-encoding fails + """ + try: + cmd = subprocess.run( + ["iconv", "-f", from_encoding, "-t", "UTF-8", context.tmp], + capture_output=True, + check=True, + ) + with open(output_file, "wb") as f: + f.write(cmd.stdout) + context.logger.info("Successfully re-encoded to UTF-8") + except subprocess.CalledProcessError as e: + raise utils.JobError( + f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" + ) diff --git a/ckanext/datapusher_plus/jobs/stages/formula.py b/ckanext/datapusher_plus/jobs/stages/formula.py new file mode 100644 index 0000000..ad9bb53 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/formula.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- +""" +Formula stage for the DataPusher Plus pipeline. + +Handles DRUF (Data Resource Update Formulae) processing using Jinja2. +""" + +import time +from typing import Dict, Any, Optional + +import ckanext.datapusher_plus.datastore_utils as dsu +import ckanext.datapusher_plus.jinja2_helpers as j2h +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class FormulaStage(BaseStage): + """ + Processes DRUF formulae using Jinja2 templates. + + This stage is optional and requires the ckanext-scheming extension. + If scheming is not available, the stage will be skipped gracefully. 
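+
+    The stage only runs when a scheming plugin is enabled in the CKAN
+    configuration, e.g. (illustrative ini snippet):
+
+        ckan.plugins = ... datapusher_plus scheming_datasets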
+ + Responsibilities: + - Fetch scheming YAML and package metadata + - Process package formulae (direct updates) + - Process resource formulae (direct updates) + - Process package suggestion formulae + - Process resource suggestion formulae + + DRUF formulae come in two types: + 1. "formula": Direct field updates (package/resource) + 2. "suggestion_formula": Populates suggestion popovers for data entry + """ + + def __init__(self): + super().__init__(name="FormulaProcessing") + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Skip this stage if ckanext-scheming is not enabled in ckan.plugins. + + Args: + context: Processing context + + Returns: + True if scheming plugin is not enabled, False otherwise + """ + try: + # Check if scheming is in the ckan.plugins configuration + import ckan.plugins.toolkit as tk + + # Get the list of enabled plugins from config + plugins_config = tk.config.get('ckan.plugins', '') + enabled_plugins = [p.strip() for p in plugins_config.split()] + + # Check for scheming-related plugins + scheming_plugins = ['scheming_datasets', 'scheming_groups', + 'scheming_organizations', 'scheming'] + + if any(plugin in enabled_plugins for plugin in scheming_plugins): + return False # Scheming is enabled, don't skip + + # Scheming not enabled in config + context.logger.info( + "Skipping FormulaProcessing stage - ckanext-scheming not enabled in ckan.plugins" + ) + return True + + except Exception as e: + # If we can't read config, log and skip + context.logger.warning( + f"Unable to check ckan.plugins configuration: {e}. " + "Skipping FormulaProcessing stage." + ) + return True + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Process DRUF formulae. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + Returns early (None) if critical errors occur + """ + formulae_start = time.perf_counter() + + # Fetch scheming YAML and package + package_id = context.resource["package_id"] + try: + scheming_yaml, package = dsu.get_scheming_yaml( + package_id, scheming_yaml_type="dataset" + ) + except Exception as e: + context.logger.warning( + f"Unable to fetch scheming YAML (scheming may not be configured): {e}" + ) + context.logger.info("Skipping formula processing") + return context # Skip formula processing but continue pipeline + + # Validate scheming YAML + if not scheming_yaml or not isinstance(scheming_yaml, dict): + context.logger.info("No valid scheming YAML found, skipping formula processing") + return context + + # Check for suggestion formulae + has_suggestion_formula = self._check_for_suggestion_formulae(scheming_yaml) + + if has_suggestion_formula: + context.logger.info("Found suggestion formulae in schema") + + # Validate and setup dpp_suggestions field + if not self._setup_dpp_suggestions(context, scheming_yaml, package): + return None # Critical error, abort + else: + context.logger.info("No suggestion formulae found") + + context.logger.log(5, f"package: {package}") + + # Get resource field stats (need to retrieve from context or pass in) + resource_fields_stats = self._get_resource_field_stats(context) + resource_fields_freqs = self._get_resource_field_freqs(context) + + # Initialize formula processor + formula_processor = j2h.FormulaProcessor( + scheming_yaml, + package, + context.resource, + resource_fields_stats, + resource_fields_freqs, + context.dataset_stats, + context.logger, + ) + + # Update status + package.setdefault("dpp_suggestions", {})[ + "STATUS" + ] = "STARTING FORMULAE 
PROCESSING..." + dsu.patch_package(package) + + # Clear LRU caches + self._clear_caches() + + # Process package formulae (direct updates) + package = self._process_package_formulae( + context, formula_processor, package + ) + + # Process resource formulae (direct updates) + self._process_resource_formulae(context, formula_processor) + + # Process package suggestion formulae + package = self._process_package_suggestions( + context, formula_processor, package, package_id + ) + + # Process resource suggestion formulae + package = self._process_resource_suggestions( + context, formula_processor, package, package_id + ) + + # Formulae processing complete + formulae_elapsed = time.perf_counter() - formulae_start + context.logger.info( + f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." + ) + + return context + + def _check_for_suggestion_formulae(self, scheming_yaml: Dict[str, Any]) -> bool: + """ + Check if scheming YAML contains suggestion formulae. + + Args: + scheming_yaml: Scheming YAML dictionary + + Returns: + True if suggestion formulae exist + """ + return any( + isinstance(field, dict) + and any(key.startswith("suggestion_formula") for key in field.keys()) + for field in scheming_yaml["dataset_fields"] + ) + + def _setup_dpp_suggestions( + self, + context: ProcessingContext, + scheming_yaml: Dict[str, Any], + package: Dict[str, Any], + ) -> bool: + """ + Validate and setup dpp_suggestions field. + + Args: + context: Processing context + scheming_yaml: Scheming YAML dictionary + package: Package dictionary + + Returns: + True if setup successful, False if critical error + """ + # Check if schema has dpp_suggestions field + schema_has_dpp_suggestions = any( + isinstance(field, dict) and field.get("field_name") == "dpp_suggestions" + for field in scheming_yaml["dataset_fields"] + ) + + if not schema_has_dpp_suggestions: + context.logger.error( + '"dpp_suggestions" field required but not found in your schema. ' + "Ensure that your scheming.yaml file contains the " + '"dpp_suggestions" field as a json_object.' + ) + return False + else: + context.logger.info('Found "dpp_suggestions" field in schema') + + # Add dpp_suggestions to package if missing + if "dpp_suggestions" not in package: + context.logger.warning( + 'Warning: "dpp_suggestions" field required to process Suggestion ' + "Formulae is not found in this package. " + 'Adding "dpp_suggestions" to package' + ) + + try: + package["dpp_suggestions"] = {} + dsu.patch_package(package) + context.logger.warning('"dpp_suggestions" field added to package') + except Exception as e: + context.logger.error(f'Error adding "dpp_suggestions" field {e}') + return False + + return True + + def _get_resource_field_stats(self, context: ProcessingContext) -> Dict[str, Any]: + """ + Get resource field statistics from context. + + Args: + context: Processing context + + Returns: + Resource field statistics dictionary + """ + return context.resource_fields_stats + + def _get_resource_field_freqs(self, context: ProcessingContext) -> Dict[str, Any]: + """ + Get resource field frequencies from context. 
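+
+        The frequencies are expected to have been compiled by an earlier
+        analysis step (qsv frequency output) and carried on the context.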
+ + Args: + context: Processing context + + Returns: + Resource field frequencies dictionary + """ + return context.resource_fields_freqs + + def _clear_caches(self) -> None: + """Clear LRU caches before processing formulae.""" + dsu.datastore_search.cache_clear() + dsu.datastore_search_sql.cache_clear() + dsu.datastore_info.cache_clear() + dsu.index_exists.cache_clear() + + def _process_package_formulae( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + ) -> Dict[str, Any]: + """ + Process package formulae (direct updates). + + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + + Returns: + Updated package dictionary + """ + package_updates = formula_processor.process_formulae( + "package", "dataset_fields", "formula" + ) + + if package_updates: + package.update(package_updates) + status_msg = "PACKAGE formulae processed..." + package["dpp_suggestions"]["STATUS"] = status_msg + + try: + patched_package = dsu.patch_package(package) + context.logger.debug(f"Package after patching: {patched_package}") + package = patched_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error patching package: {str(e)}") + + return package + + def _process_resource_formulae( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + ) -> None: + """ + Process resource formulae (direct updates). + + Args: + context: Processing context + formula_processor: Formula processor instance + """ + resource_updates = formula_processor.process_formulae( + "resource", "resource_fields", "formula" + ) + + if resource_updates: + context.resource.update(resource_updates) + status_msg = "RESOURCE formulae processed..." + + if context.resource.get("dpp_suggestions"): + context.resource["dpp_suggestions"]["STATUS"] = status_msg + else: + context.resource["dpp_suggestions"] = {"STATUS": status_msg} + + context.logger.info(status_msg) + + def _process_package_suggestions( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + package_id: str, + ) -> Dict[str, Any]: + """ + Process package suggestion formulae. + + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + package_id: Package ID + + Returns: + Updated package dictionary + """ + package_suggestions = formula_processor.process_formulae( + "package", "dataset_fields", "suggestion_formula" + ) + + if package_suggestions: + context.logger.log(5, f"package_suggestions: {package_suggestions}") + revise_update_content = {"package": package_suggestions} + + try: + status_msg = "PACKAGE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + context.logger.log(5, f"Package after revising: {revised_package}") + package = revised_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error revising package: {str(e)}") + + return package + + def _process_resource_suggestions( + self, + context: ProcessingContext, + formula_processor: j2h.FormulaProcessor, + package: Dict[str, Any], + package_id: str, + ) -> Dict[str, Any]: + """ + Process resource suggestion formulae. + + Note: Updates PACKAGE dpp_suggestions field, not resource. 
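+        Suggestions are stored under the package-level dpp_suggestions object,
+        keyed by the resource name.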
+ + Args: + context: Processing context + formula_processor: Formula processor instance + package: Package dictionary + package_id: Package ID + + Returns: + Updated package dictionary + """ + resource_suggestions = formula_processor.process_formulae( + "resource", "resource_fields", "suggestion_formula" + ) + + if resource_suggestions: + context.logger.log(5, f"resource_suggestions: {resource_suggestions}") + resource_name = context.resource["name"] + revise_update_content = { + "resource": {resource_name: resource_suggestions} + } + + # Handle existing suggestions + if package.get("dpp_suggestions"): + package["dpp_suggestions"].update(revise_update_content["resource"]) + else: + package["dpp_suggestions"] = revise_update_content["resource"] + + try: + status_msg = "RESOURCE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + context.logger.log(5, f"Package after revising: {revised_package}") + package = revised_package + context.logger.info(status_msg) + except Exception as e: + context.logger.error(f"Error revising package: {str(e)}") + + return package diff --git a/ckanext/datapusher_plus/jobs/stages/indexing.py b/ckanext/datapusher_plus/jobs/stages/indexing.py new file mode 100644 index 0000000..7884535 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/indexing.py @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +""" +Indexing stage for the DataPusher Plus pipeline. + +Handles automatic index creation based on cardinality and configuration. +""" + +import time +import psycopg2 +from psycopg2 import sql +from typing import List + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class IndexingStage(BaseStage): + """ + Creates database indexes automatically based on cardinality. + + Responsibilities: + - Create unique indexes for columns with all unique values + - Create regular indexes for low-cardinality columns + - Create indexes on date columns if configured + - Optimize table with VACUUM ANALYZE + """ + + def __init__(self): + super().__init__(name="Indexing") + + def should_skip(self, context: ProcessingContext) -> bool: + """ + Skip indexing if not configured. + + Args: + context: Processing context + + Returns: + True if indexing should be skipped + """ + # Get datetime columns (need to check if analysis stage stored this) + datetimecols_list = self._get_datetime_columns(context) + + return not ( + conf.AUTO_INDEX_THRESHOLD + or (conf.AUTO_INDEX_DATES and datetimecols_list) + or conf.AUTO_UNIQUE_INDEX + ) + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Create database indexes. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If indexing fails + """ + index_start = time.perf_counter() + + # Get datetime columns + datetimecols_list = self._get_datetime_columns(context) + + context.logger.info( + f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} " + f"unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} " + f"Auto-index dates: {conf.AUTO_INDEX_DATES} ..." 
+ ) + + # Get cardinality data + headers_cardinality = context.dataset_stats.get("HEADERS_CARDINALITY", []) + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + + # Adjust threshold if set to -1 (index all columns) + auto_index_threshold = conf.AUTO_INDEX_THRESHOLD + if auto_index_threshold == -1: + auto_index_threshold = record_count + + # Create indexes + index_count = self._create_indexes( + context, + headers_cardinality, + datetimecols_list, + record_count, + auto_index_threshold, + ) + + index_elapsed = time.perf_counter() - index_start + context.logger.info( + f'...indexing/vacuum analysis done. Indexed {index_count} column/s ' + f'in "{context.resource_id}" in {index_elapsed:,.2f} seconds.' + ) + + return context + + def _get_datetime_columns(self, context: ProcessingContext) -> List[str]: + """ + Extract datetime column names from headers_dicts. + + Args: + context: Processing context + + Returns: + List of datetime column names + """ + datetimecols_list = [] + for header in context.headers_dicts: + if header.get("type") == "timestamp": + datetimecols_list.append(header["id"]) + return datetimecols_list + + def _create_indexes( + self, + context: ProcessingContext, + headers_cardinality: List[int], + datetimecols_list: List[str], + record_count: int, + auto_index_threshold: int, + ) -> int: + """ + Create indexes on appropriate columns. + + Args: + context: Processing context + headers_cardinality: List of cardinality values for each column + datetimecols_list: List of datetime column names + record_count: Total number of records + auto_index_threshold: Cardinality threshold for indexing + + Returns: + Number of indexes created + + Raises: + utils.JobError: If database connection fails + """ + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + index_cur = raw_connection.cursor() + index_count = 0 + + # Iterate through columns + for idx, cardinality in enumerate(headers_cardinality): + if idx >= len(context.headers): + break + + curr_col = context.headers[idx] + + # Check if we should create a unique index + if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: + if self._create_unique_index( + context, index_cur, curr_col, cardinality + ): + index_count += 1 + + # Check if we should create a regular index + elif cardinality <= auto_index_threshold or ( + conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) + ): + if self._create_regular_index( + context, index_cur, curr_col, cardinality, datetimecols_list + ): + index_count += 1 + + index_cur.close() + raw_connection.commit() + + # VACUUM ANALYZE to optimize indexes + self._vacuum_analyze(context, raw_connection) + + return index_count + + finally: + if raw_connection: + raw_connection.close() + + def _create_unique_index( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + column: str, + cardinality: int, + ) -> bool: + """ + Create a unique index on a column. + + Args: + context: Processing context + cursor: Database cursor + column: Column name + cardinality: Column cardinality + + Returns: + True if index was created successfully, False otherwise + """ + if conf.PREVIEW_ROWS > 0: + unique_value_count = min(conf.PREVIEW_ROWS, cardinality) + else: + unique_value_count = cardinality + + context.logger.info( + f'Creating UNIQUE index on "{column}" for {unique_value_count} unique values...' 
+ ) + + try: + cursor.execute( + sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( + sql.Identifier(context.resource_id), + sql.Identifier(column), + ) + ) + return True + except psycopg2.Error as e: + context.logger.warning(f'Could not CREATE UNIQUE INDEX on "{column}": {e}') + return False + + def _create_regular_index( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + column: str, + cardinality: int, + datetimecols_list: List[str], + ) -> bool: + """ + Create a regular index on a column. + + Args: + context: Processing context + cursor: Database cursor + column: Column name + cardinality: Column cardinality + datetimecols_list: List of datetime columns + + Returns: + True if index was created successfully, False otherwise + """ + if column in datetimecols_list: + context.logger.info( + f'Creating index on "{column}" date column for {cardinality} unique value/s...' + ) + else: + context.logger.info( + f'Creating index on "{column}" for {cardinality} unique value/s...' + ) + + try: + cursor.execute( + sql.SQL("CREATE INDEX ON {} ({})").format( + sql.Identifier(context.resource_id), + sql.Identifier(column), + ) + ) + return True + except psycopg2.Error as e: + context.logger.warning(f'Could not CREATE INDEX on "{column}": {e}') + return False + + def _vacuum_analyze( + self, context: ProcessingContext, connection: psycopg2.extensions.connection + ) -> None: + """ + Run VACUUM ANALYZE to optimize indexes. + + Args: + context: Processing context + connection: Database connection + """ + context.logger.info("Vacuum Analyzing table to optimize indices...") + + connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = connection.cursor() + try: + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(context.resource_id)) + ) + finally: + analyze_cur.close() diff --git a/ckanext/datapusher_plus/jobs/stages/metadata.py b/ckanext/datapusher_plus/jobs/stages/metadata.py new file mode 100644 index 0000000..e24ec5c --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/metadata.py @@ -0,0 +1,391 @@ +# -*- coding: utf-8 -*- +""" +Metadata stage for the DataPusher Plus pipeline. + +Handles resource metadata updates, auto-aliasing, and summary statistics. +""" + +import os +import time +import psycopg2 +from psycopg2 import sql +from typing import Optional + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class MetadataStage(BaseStage): + """ + Updates resource metadata and creates aliases. + + Responsibilities: + - Create auto-aliases for resources + - Create summary statistics resource + - Update resource metadata (datastore_active, record counts, etc.) + - Set final aliases and calculate record counts + """ + + def __init__(self): + super().__init__(name="MetadataUpdate") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Update resource metadata. 
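+
+        Any auto-alias created here is passed to dsu.send_resource_to_datastore
+        together with the final headers so the DataStore registers the alias
+        and recalculates the record count.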
+ + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If metadata update fails + """ + metadata_start = time.perf_counter() + context.logger.info("UPDATING RESOURCE METADATA...") + + # Connect to database for aliasing operations + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + + try: + cur = raw_connection.cursor() + + # Create auto-alias if configured + alias = self._create_auto_alias(context, cur) + + # Create summary statistics resource if configured + self._create_summary_stats_resource(context, cur) + + # Commit database changes + cur.close() + raw_connection.commit() + + finally: + if raw_connection: + raw_connection.close() + + # Update resource metadata + self._update_resource_metadata(context) + + # Set alias and calculate record count + dsu.send_resource_to_datastore( + resource=None, + resource_id=context.resource["id"], + headers=context.headers_dicts, + records=None, + aliases=alias, + calculate_record_count=True, + ) + + if alias: + context.logger.info(f'Created alias "{alias}" for "{context.resource_id}"...') + + metadata_elapsed = time.perf_counter() - metadata_start + context.logger.info( + f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in " + f"{metadata_elapsed:,.2f} seconds." + ) + + # Mark as done + package = dsu.get_package(context.resource["package_id"]) + package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" + dsu.patch_package(package) + + return context + + def _create_auto_alias( + self, context: ProcessingContext, cursor: psycopg2.extensions.cursor + ) -> Optional[str]: + """ + Create auto-alias for the resource. + + Args: + context: Processing context + cursor: Database cursor + + Returns: + Alias name if created, None otherwise + """ + if not conf.AUTO_ALIAS: + return None + + context.logger.info( + f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ..." 
+ ) + + # Get package info for alias construction + package = dsu.get_package(context.resource["package_id"]) + + resource_name = context.resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = owner_org.get("name") if owner_org else "" + + if not (resource_name and package_name and owner_org_name): + context.logger.warning( + f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" + ) + return None + + # Create base alias (limited to 55 chars for sequence/stats suffix) + alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + + # Check if alias exists + cursor.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", + (alias + "%",), + ) + alias_query_result = cursor.fetchone() + + if alias_query_result: + alias_count = alias_query_result[0] + existing_alias_of = alias_query_result[1] + else: + alias_count = 0 + existing_alias_of = "" + + # Handle alias uniqueness + if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: + alias_sequence = alias_count + 1 + while True: + # Find next available sequence number + alias = f"{alias}-{alias_sequence:03}" + cursor.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", + (alias + "%",), + ) + result = cursor.fetchone() + alias_exists = result[0] if result else 0 + if not alias_exists: + break + alias_sequence += 1 + elif alias_count == 1: + # Drop existing alias + context.logger.warning( + f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' + ) + try: + cursor.execute( + sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) + ) + except psycopg2.Error as e: + context.logger.warning(f"Could not drop alias/view: {e}") + + return alias + + def _create_summary_stats_resource( + self, context: ProcessingContext, cursor: psycopg2.extensions.cursor + ) -> None: + """ + Create summary statistics resource. 
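+
+        The stats resource is aliased as "<resource_id>-stats" and, when
+        auto-aliasing is enabled, also under the main resource's auto-alias
+        suffixed with "-stats".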
+ + Args: + context: Processing context + cursor: Database cursor + + Raises: + utils.JobError: If stats resource creation fails + """ + # Check if we should create summary stats + if not (conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW): + return + + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + if not (conf.PREVIEW_ROWS == 0 or conf.SUMMARY_STATS_WITH_PREVIEW): + # Skip if preview mode and not explicitly enabled + return + + stats_resource_id = context.resource_id + "-stats" + + # Delete existing stats resource + self._delete_existing_stats(context, cursor, stats_resource_id) + + # Prepare aliases for stats resource + stats_aliases = [stats_resource_id] + if conf.AUTO_ALIAS: + # Get base alias from main resource + package = dsu.get_package(context.resource["package_id"]) + resource_name = context.resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = owner_org.get("name") if owner_org else "" + base_alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + + auto_alias_stats_id = base_alias + "-stats" + stats_aliases.append(auto_alias_stats_id) + + # Delete existing auto-aliased stats + self._delete_existing_stats(context, cursor, auto_alias_stats_id) + + # Infer stats schema + qsv_stats_csv = os.path.join(context.temp_dir, "qsv_stats.csv") + stats_stats_dict = self._infer_stats_schema(context, qsv_stats_csv) + + # Create stats resource + resource_name = context.resource.get("name") + stats_resource = { + "package_id": context.resource["package_id"], + "name": resource_name + " - Summary Statistics", + "format": "CSV", + "mimetype": "text/csv", + } + + stats_response = dsu.send_resource_to_datastore( + stats_resource, + resource_id=None, + headers=stats_stats_dict, + records=None, + aliases=stats_aliases, + calculate_record_count=False, + ) + + context.logger.info(f"stats_response: {stats_response}") + + new_stats_resource_id = stats_response["result"]["resource_id"] + + # Copy stats data to datastore + self._copy_stats_to_datastore( + context, cursor, qsv_stats_csv, new_stats_resource_id, stats_stats_dict + ) + + # Update stats resource metadata + stats_resource["id"] = new_stats_resource_id + stats_resource["summary_statistics"] = True + stats_resource["summary_of_resource"] = context.resource_id + dsu.update_resource(stats_resource) + + def _delete_existing_stats( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + stats_id: str, + ) -> None: + """ + Delete existing stats resource if it exists. + + Args: + context: Processing context + cursor: Database cursor + stats_id: Stats resource ID or alias + """ + existing_stats = dsu.datastore_resource_exists(stats_id) + if existing_stats: + context.logger.info(f'Deleting existing summary stats "{stats_id}".') + + cursor.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (stats_id + "%",), + ) + stats_alias_result = cursor.fetchone() + + if stats_alias_result: + existing_stats_alias_of = stats_alias_result[0] + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + def _infer_stats_schema( + self, context: ProcessingContext, qsv_stats_csv: str + ) -> list: + """ + Infer schema for stats CSV. 
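+
+        The typesonly output of qsv stats is parsed line by line: the header
+        row is skipped, then the field name is taken from the first
+        comma-separated column and the inferred type from the second, mapped
+        through conf.TYPE_MAPPING.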
+ + Args: + context: Processing context + qsv_stats_csv: Path to stats CSV + + Returns: + List of stats field dictionaries + + Raises: + utils.JobError: If schema inference fails + """ + try: + qsv_stats_stats = context.qsv.stats( + qsv_stats_csv, + typesonly=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot run stats on CSV stats: {e}") + + stats_stats = str(qsv_stats_stats.stdout).strip() + stats_stats_dict = [ + dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) + for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) + ] + + context.logger.info(f"stats_stats_dict: {stats_stats_dict}") + + return stats_stats_dict + + def _copy_stats_to_datastore( + self, + context: ProcessingContext, + cursor: psycopg2.extensions.cursor, + qsv_stats_csv: str, + stats_resource_id: str, + stats_stats_dict: list, + ) -> None: + """ + Copy stats data to datastore. + + Args: + context: Processing context + cursor: Database cursor + qsv_stats_csv: Path to stats CSV + stats_resource_id: Stats resource ID + stats_stats_dict: Stats schema + + Raises: + utils.JobError: If COPY fails + """ + col_names_list = [h["id"] for h in stats_stats_dict] + stats_aliases_str = f"{stats_resource_id}, ..." + + context.logger.info( + f'ADDING SUMMARY STATISTICS {col_names_list} in "{stats_resource_id}" ' + f'with alias/es "{stats_aliases_str}"...' + ) + + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN WITH (FORMAT CSV, HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(stats_resource_id), + column_names, + ) + + with open(qsv_stats_csv, "rb") as f: + try: + cursor.copy_expert(copy_sql, f) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + + def _update_resource_metadata(self, context: ProcessingContext) -> None: + """ + Update resource metadata fields. + + Args: + context: Processing context + """ + record_count = context.dataset_stats.get("RECORD_COUNT", 0) + + context.resource["datastore_active"] = True + context.resource["total_record_count"] = record_count + + if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): + context.resource["preview"] = True + context.resource["preview_rows"] = context.copied_count + else: + context.resource["preview"] = False + context.resource["preview_rows"] = None + context.resource["partial_download"] = False + + dsu.update_resource(context.resource) diff --git a/ckanext/datapusher_plus/jobs/stages/validation.py b/ckanext/datapusher_plus/jobs/stages/validation.py new file mode 100644 index 0000000..7f64018 --- /dev/null +++ b/ckanext/datapusher_plus/jobs/stages/validation.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +""" +Validation stage for the DataPusher Plus pipeline. + +Handles CSV validation and deduplication. +""" + +import os +import json +import subprocess +from typing import Dict, Any, Union + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.config as conf +from ckanext.datapusher_plus.jobs.stages.base import BaseStage +from ckanext.datapusher_plus.jobs.context import ProcessingContext + + +class ValidationStage(BaseStage): + """ + Validates CSV file and performs deduplication. 
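+
+    Both checks operate on the working copy in context.tmp; when deduplication
+    rewrites the file, context.update_tmp() points later stages at the deduped
+    CSV.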
+ + Responsibilities: + - Validate CSV against RFC4180 standard + - Check if CSV is sorted + - Count duplicates + - Deduplicate if needed + """ + + def __init__(self): + super().__init__(name="Validation") + + def process(self, context: ProcessingContext) -> ProcessingContext: + """ + Validate CSV and deduplicate if needed. + + Args: + context: Processing context + + Returns: + Updated context + + Raises: + utils.JobError: If validation fails + """ + # Validate CSV + self._validate_csv(context) + + # Check for duplicates and sort order + dupe_count = 0 + if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: + dupe_count = self._check_duplicates(context) + + # Deduplicate if needed + if conf.DEDUP and dupe_count > 0: + self._deduplicate(context, dupe_count) + else: + context.add_stat("DEDUPED", False) + + return context + + def _validate_csv(self, context: ProcessingContext) -> None: + """ + Validate CSV against RFC4180 standard. + + Args: + context: Processing context + + Raises: + utils.JobError: If CSV is invalid + """ + context.logger.info("Validating CSV...") + try: + context.qsv.validate(context.tmp) + except utils.JobError as e: + raise utils.JobError(f"qsv validate failed: {e}") + + context.logger.info("Well-formed, valid CSV file confirmed...") + + def _check_duplicates(self, context: ProcessingContext) -> int: + """ + Check for duplicates and if CSV is sorted. + + Args: + context: Processing context + + Returns: + Number of duplicates found + + Raises: + utils.JobError: If sortcheck fails + """ + context.logger.info("Checking for duplicates and if the CSV is sorted...") + + try: + qsv_sortcheck = context.qsv.sortcheck( + context.tmp, json_output=True, uses_stdio=True + ) + except utils.JobError as e: + raise utils.JobError( + f"Failed to check if CSV is sorted and has duplicates: {e}" + ) + + # Parse sortcheck output + sortcheck_json = self._parse_sortcheck_output(qsv_sortcheck) + + # Extract and store statistics + is_sorted = bool(sortcheck_json.get("sorted", False)) + record_count = int(sortcheck_json.get("record_count", 0)) + unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) + dupe_count = int(sortcheck_json.get("dupe_count", 0)) + + context.add_stat("IS_SORTED", is_sorted) + context.add_stat("RECORD_COUNT", record_count) + context.add_stat("UNSORTED_BREAKS", unsorted_breaks) + context.add_stat("DUPE_COUNT", dupe_count) + + # Format log message + sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" + if is_sorted and dupe_count > 0: + sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" + + context.logger.info(sortcheck_msg) + + return dupe_count + + def _parse_sortcheck_output( + self, qsv_sortcheck: Union[subprocess.CompletedProcess, Dict[str, Any]] + ) -> Dict[str, Any]: + """ + Parse sortcheck JSON output. 
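+
+        Expected shape of the sortcheck JSON (illustrative; only the keys read
+        below are shown):
+
+            {"sorted": false, "record_count": 1000,
+             "unsorted_breaks": 12, "dupe_count": 0}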
+ + Args: + qsv_sortcheck: Output from qsv sortcheck command + + Returns: + Parsed JSON dictionary + + Raises: + utils.JobError: If parsing fails + """ + try: + # Handle both subprocess.CompletedProcess and dict outputs + stdout_content = ( + qsv_sortcheck.stdout + if hasattr(qsv_sortcheck, "stdout") + else qsv_sortcheck.get("stdout") + ) + sortcheck_json = json.loads(str(stdout_content)) + except (json.JSONDecodeError, AttributeError) as e: + raise utils.JobError(f"Failed to parse sortcheck JSON output: {e}") + + # Validate required fields + try: + # Ensure numeric values are valid + int(sortcheck_json.get("record_count", 0)) + int(sortcheck_json.get("unsorted_breaks", 0)) + int(sortcheck_json.get("dupe_count", 0)) + except (ValueError, TypeError) as e: + raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") + + return sortcheck_json + + def _deduplicate(self, context: ProcessingContext, dupe_count: int) -> None: + """ + Deduplicate the CSV file. + + Args: + context: Processing context + dupe_count: Number of duplicates found + + Raises: + utils.JobError: If deduplication fails + """ + qsv_dedup_csv = os.path.join(context.temp_dir, "qsv_dedup.csv") + context.logger.info(f"{dupe_count} duplicate rows found. Deduping...") + + try: + context.qsv.extdedup(context.tmp, qsv_dedup_csv) + except utils.JobError as e: + raise utils.JobError(f"Check for duplicates error: {e}") + + context.add_stat("DEDUPED", True) + context.update_tmp(qsv_dedup_csv) + context.logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") diff --git a/ckanext/datapusher_plus/jobs/utils/__init__.py b/ckanext/datapusher_plus/jobs/utils/__init__.py new file mode 100644 index 0000000..4248b9e --- /dev/null +++ b/ckanext/datapusher_plus/jobs/utils/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +""" +Utility modules for the DataPusher Plus job processing pipeline. 
+""" + +__all__ = [] diff --git a/ckanext/datapusher_plus/jobs_legacy.py b/ckanext/datapusher_plus/jobs_legacy.py new file mode 100644 index 0000000..23f57bd --- /dev/null +++ b/ckanext/datapusher_plus/jobs_legacy.py @@ -0,0 +1,1623 @@ +# -*- coding: utf-8 -*- +# flake8: noqa: E501 + +# Standard library imports +import csv +import hashlib +import locale +import mimetypes +import os +import subprocess +import tempfile +import time +from urllib.parse import urlsplit, urlparse +import logging +import uuid +import sys +import json +import requests +from pathlib import Path +from typing import Dict, Any, Optional, List + +# Third-party imports +import psycopg2 +from psycopg2 import sql +from datasize import DataSize +from dateutil.parser import parse as parsedate +import traceback +import sqlalchemy as sa +from rq import get_current_job + +import ckanext.datapusher_plus.utils as utils +import ckanext.datapusher_plus.helpers as dph +import ckanext.datapusher_plus.jinja2_helpers as j2h +from ckanext.datapusher_plus.job_exceptions import HTTPError +import ckanext.datapusher_plus.config as conf +import ckanext.datapusher_plus.spatial_helpers as sh +import ckanext.datapusher_plus.datastore_utils as dsu +from ckanext.datapusher_plus.logging_utils import TRACE +from ckanext.datapusher_plus.qsv_utils import QSVCommand +from ckanext.datapusher_plus.pii_screening import screen_for_pii + +if locale.getdefaultlocale()[0]: + lang, encoding = locale.getdefaultlocale() + locale.setlocale(locale.LC_ALL, locale=(lang, encoding)) +else: + locale.setlocale(locale.LC_ALL, "") + + +def validate_input(input: Dict[str, Any]) -> None: + # Especially validate metadata which is provided by the user + if "metadata" not in input: + raise utils.JobError("Metadata missing") + + data = input["metadata"] + + if "resource_id" not in data: + raise utils.JobError("No id provided.") + + +def callback_datapusher_hook(result_url: str, job_dict: Dict[str, Any]) -> bool: + api_token = utils.get_dp_plus_user_apitoken() + headers: Dict[str, str] = { + "Content-Type": "application/json", + "Authorization": api_token, + } + + try: + result = requests.post( + result_url, + data=json.dumps(job_dict, cls=utils.DatetimeJsonEncoder), + verify=conf.SSL_VERIFY, + headers=headers, + ) + except requests.ConnectionError: + return False + + return result.status_code == requests.codes.ok + + +def datapusher_plus_to_datastore(input: Dict[str, Any]) -> Optional[str]: + """ + This is the main function that is called by the datapusher_plus worker + + Errors are caught and logged in the database + + Args: + input: Dictionary containing metadata and other job information + + Returns: + Optional[str]: Returns "error" if there was an error, None otherwise + """ + job_dict: Dict[str, Any] = dict(metadata=input["metadata"], status="running") + callback_datapusher_hook(result_url=input["result_url"], job_dict=job_dict) + + job_id = get_current_job().id + errored = False + try: + push_to_datastore(input, job_id) + job_dict["status"] = "complete" + dph.mark_job_as_completed(job_id, job_dict) + except utils.JobError as e: + dph.mark_job_as_errored(job_id, str(e)) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + except Exception as e: + dph.mark_job_as_errored( + job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e) + ) + job_dict["status"] = "error" + job_dict["error"] = str(e) + log = logging.getLogger(__name__) + 
log.error(f"Datapusher Plus error: {e}, {traceback.format_exc()}") + errored = True + finally: + # job_dict is defined in datapusher_hook's docstring + is_saved_ok = callback_datapusher_hook( + result_url=input["result_url"], job_dict=job_dict + ) + errored = errored or not is_saved_ok + return "error" if errored else None + + +def push_to_datastore( + input: Dict[str, Any], task_id: str, dry_run: bool = False +) -> Optional[List[Dict[str, Any]]]: + """Download and parse a resource push its data into CKAN's DataStore. + + An asynchronous job that gets a resource from CKAN, downloads the + resource's data file and, if the data file has changed since last time, + parses the data and posts it into CKAN's DataStore. + + Args: + input: Dictionary containing metadata and other job information + task_id: Unique identifier for the task + dry_run: If True, fetch and parse the data file but don't actually post the + data to the DataStore, instead return the data headers and rows that + would have been posted. + + Returns: + Optional[List[Dict[str, Any]]]: If dry_run is True, returns the headers and rows + that would have been posted. Otherwise returns None. + """ + # Ensure temporary files are removed after run + with tempfile.TemporaryDirectory() as temp_dir: + return _push_to_datastore(task_id, input, dry_run=dry_run, temp_dir=temp_dir) + + +def _push_to_datastore( + task_id: str, + input: Dict[str, Any], + dry_run: bool = False, + temp_dir: Optional[str] = None, +) -> Optional[List[Dict[str, Any]]]: + # add job to dn (datapusher_plus_jobs table) + try: + dph.add_pending_job(task_id, **input) + except sa.exc.IntegrityError: + raise utils.JobError("Job already exists.") + handler = utils.StoringHandler(task_id, input) + logger = logging.getLogger(task_id) + logger.addHandler(handler) + + # also show logs on stderr + logger.addHandler(logging.StreamHandler()) + + # set the log level to the config upload_log_level + try: + log_level = getattr(logging, conf.UPLOAD_LOG_LEVEL.upper()) + except AttributeError: + # fallback to our custom TRACE level + log_level = TRACE + + # set the log level to the config upload_log_level + logger.setLevel(logging.INFO) + logger.info(f"Setting log level to {logging.getLevelName(int(log_level))}") + logger.setLevel(log_level) + + # check if conf.QSV_BIN exists + if not Path(conf.QSV_BIN).is_file(): + raise utils.JobError(f"{conf.QSV_BIN} not found.") + + # Initialize QSVCommand + qsv = QSVCommand(logger=logger) + + validate_input(input) + + data = input["metadata"] + + ckan_url = data["ckan_url"] + resource_id = data["resource_id"] + try: + resource = dsu.get_resource(resource_id) + except utils.JobError: + # try again in 5 seconds just incase CKAN is slow at adding resource + time.sleep(5) + resource = dsu.get_resource(resource_id) + + # check if the resource url_type is a datastore + if resource.get("url_type") == "datastore": + logger.info("Dump files are managed with the Datastore API") + return + + # check scheme + resource_url = resource.get("url") + scheme = urlsplit(resource_url).scheme + if scheme not in ("http", "https", "ftp"): + raise utils.JobError("Only http, https, and ftp resources may be fetched.") + + # ========================================================================== + # DOWNLOAD + # ========================================================================== + timer_start = time.perf_counter() + dataset_stats = {} + + # fetch the resource data + logger.info(f"Fetching from: {resource_url}...") + headers: Dict[str, str] = {} + if 
resource.get("url_type") == "upload": + # If this is an uploaded file to CKAN, authenticate the request, + # otherwise we won't get file from private resources + api_token = utils.get_dp_plus_user_apitoken() + headers["Authorization"] = api_token + + # If the ckan_url differs from this url, rewrite this url to the ckan + # url. This can be useful if ckan is behind a firewall. + if not resource_url.startswith(ckan_url): + new_url = urlparse(resource_url) + rewrite_url = urlparse(ckan_url) + new_url = new_url._replace( + scheme=rewrite_url.scheme, netloc=rewrite_url.netloc + ) + resource_url = new_url.geturl() + logger.info(f"Rewritten resource url to: {resource_url}") + + try: + kwargs: Dict[str, Any] = { + "headers": headers, + "timeout": conf.TIMEOUT, + "verify": conf.SSL_VERIFY, + "stream": True, + } + if conf.USE_PROXY: + kwargs["proxies"] = { + "http": conf.DOWNLOAD_PROXY, + "https": conf.DOWNLOAD_PROXY, + } + with requests.get(resource_url, **kwargs) as response: + response.raise_for_status() + + cl = response.headers.get("content-length") + max_content_length = conf.MAX_CONTENT_LENGTH + ct = response.headers.get("content-type") + + try: + if cl and int(cl) > max_content_length and conf.PREVIEW_ROWS > 0: + raise utils.JobError( + f"Resource too large to download: {DataSize(int(cl)):.2MB} > max ({DataSize(int(max_content_length)):.2MB})." + ) + except ValueError: + pass + + resource_format = resource.get("format").upper() + + # if format was not specified, try to get it from mime type + if not resource_format: + logger.info("File format: NOT SPECIFIED") + # if we have a mime type, get the file extension from the response header + if ct: + resource_format = mimetypes.guess_extension(ct.split(";")[0]) + + if resource_format is None: + raise utils.JobError( + "Cannot determine format from mime type. Please specify format." + ) + logger.info(f"Inferred file format: {resource_format}") + else: + raise utils.JobError( + "Server did not return content-type. Please specify format." + ) + else: + logger.info(f"File format: {resource_format}") + + tmp = os.path.join(temp_dir, "tmp." + resource_format) + length = 0 + # using MD5 for file deduplication only + # no need for it to be cryptographically secure + m = hashlib.md5() # DevSkim: ignore DS126858 + + # download the file + if cl: + logger.info(f"Downloading {DataSize(int(cl)):.2MB} file...") + else: + logger.info("Downloading file of unknown size...") + + with open(tmp, "wb") as tmp_file: + for chunk in response.iter_content(conf.CHUNK_SIZE): + length += len(chunk) + if length > max_content_length and not conf.PREVIEW_ROWS: + raise utils.JobError( + f"Resource too large to process: {length} > max ({max_content_length})." + ) + tmp_file.write(chunk) + m.update(chunk) + + except requests.HTTPError as e: + raise HTTPError( + f"DataPusher+ received a bad HTTP response when trying to download " + f"the data file from {resource_url}. 
Status code: {e.response.status_code}, " + f"Response content: {e.response.content}", + status_code=e.response.status_code, + request_url=resource_url, + response=e.response.content, + ) + except requests.RequestException as e: + raise HTTPError( + message=str(e), + status_code=None, + request_url=resource_url, + response=None, + ) + + file_hash = m.hexdigest() + dataset_stats["ORIGINAL_FILE_SIZE"] = length + + # check if the resource metadata (like data dictionary data types) + # has been updated since the last fetch + resource_updated = False + resource_last_modified = resource.get("last_modified") + if resource_last_modified: + resource_last_modified = parsedate(resource_last_modified) + file_last_modified = response.headers.get("last-modified") + if file_last_modified: + file_last_modified = parsedate(file_last_modified).replace(tzinfo=None) + if file_last_modified < resource_last_modified: + resource_updated = True + + if ( + resource.get("hash") == file_hash + and not data.get("ignore_hash") + and not conf.IGNORE_FILE_HASH + and not resource_updated + ): + logger.warning(f"Upload skipped as the file hash hasn't changed: {file_hash}.") + return + + resource["hash"] = file_hash + + fetch_elapsed = time.perf_counter() - timer_start + logger.info( + f"Fetched {DataSize(length):.2MB} file in {fetch_elapsed:,.2f} seconds." + ) + + # Check if the file is a zip file + unzipped_format = "" + if resource_format.upper() == "ZIP": + logger.info("Processing ZIP file...") + + file_count, extracted_path, unzipped_format = dph.extract_zip_or_metadata( + tmp, temp_dir, logger + ) + if not file_count: + logger.error("ZIP file invalid or no files found in ZIP file.") + return + logger.info( + f"More than one file in the ZIP file ({file_count} files), saving metadata..." + if file_count > 1 + else f"Extracted {unzipped_format} file: {extracted_path}" + ) + tmp = extracted_path + + # =================================================================================== + # ANALYZE WITH QSV + # =================================================================================== + # Start Analysis using qsv instead of messytables, as + # 1) its type inferences are bullet-proof not guesses as it scans the entire file, + # 2) its super-fast, and + # 3) it has addl data-wrangling capabilities we use in DP+ (e.g. stats, dedup, etc.) + dupe_count = 0 + record_count = 0 + analysis_start = time.perf_counter() + logger.info("ANALYZING WITH QSV..") + + # flag to check if the file is a spatial format + spatial_format_flag = False + simplification_failed_flag = False + # ----------------- is it a spreadsheet? --------------- + # check content type or file extension if its a spreadsheet + spreadsheet_extensions = ["XLS", "XLSX", "ODS", "XLSM", "XLSB"] + file_format = resource.get("format").upper() + if ( + file_format in spreadsheet_extensions + or unzipped_format in spreadsheet_extensions + ): + # if so, export spreadsheet as a CSV file + default_excel_sheet = conf.DEFAULT_EXCEL_SHEET + file_format = unzipped_format if unzipped_format != "" else file_format + logger.info(f"Converting {file_format} sheet {default_excel_sheet} to CSV...") + # first, we need a temporary spreadsheet filename with the right file extension + # we only need the filename though, that's why we remove it + # and create a hardlink to the file we got from CKAN + qsv_spreadsheet = os.path.join(temp_dir, "qsv_spreadsheet." 
+ file_format) + os.link(tmp, qsv_spreadsheet) + + # run `qsv excel` and export it to a CSV + # use --trim option to trim column names and the data + qsv_excel_csv = os.path.join(temp_dir, "qsv_excel.csv") + try: + qsv_excel = qsv.excel( + qsv_spreadsheet, + sheet=default_excel_sheet, + trim=True, + output_file=qsv_excel_csv, + ) + except utils.JobError as e: + raise utils.JobError( + f"Upload aborted. Cannot export spreadsheet(?) to CSV: {e}" + ) + excel_export_msg = qsv_excel.stderr + logger.info(f"{excel_export_msg}...") + tmp = qsv_excel_csv + elif resource_format.upper() in ["SHP", "QGIS", "GEOJSON"]: + logger.info("SHAPEFILE or GEOJSON file detected...") + + qsv_spatial_file = os.path.join( + temp_dir, + "qsv_spatial_" + str(uuid.uuid4()) + "." + resource_format, + ) + os.link(tmp, qsv_spatial_file) + qsv_spatial_csv = os.path.join(temp_dir, "qsv_spatial.csv") + + if conf.AUTO_SPATIAL_SIMPLIFICATION: + # Try to convert spatial file to CSV using spatial_helpers + logger.info( + f"Converting spatial file to CSV with a simplification relative tolerance of {conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE}..." + ) + + try: + # Use the convert_to_csv function from spatial_helpers + success, error_message, bounds = sh.process_spatial_file( + qsv_spatial_file, + resource_format, + output_csv_path=qsv_spatial_csv, + tolerance=conf.SPATIAL_SIMPLIFICATION_RELATIVE_TOLERANCE, + task_logger=logger, + ) + + if success: + logger.info( + "Spatial file successfully simplified and converted to CSV" + ) + tmp = qsv_spatial_csv + + # Check if the simplified resource already exists + simplified_resource_name = ( + os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1] + ) + existing_resource, existing_resource_id = dsu.resource_exists( + resource["package_id"], simplified_resource_name + ) + + if existing_resource: + logger.info( + "Simplified resource already exists. Replacing it..." + ) + dsu.delete_resource(existing_resource_id) + else: + logger.info( + "Simplified resource does not exist. Uploading it..." + ) + new_simplified_resource = { + "package_id": resource["package_id"], + "name": os.path.splitext(resource["name"])[0] + + "_simplified" + + os.path.splitext(resource["name"])[1], + "url": "", + "format": resource["format"], + "hash": "", + "mimetype": resource["mimetype"], + "mimetype_inner": resource["mimetype_inner"], + } + + # Add bounds information if available + if bounds: + minx, miny, maxx, maxy = bounds + new_simplified_resource.update( + { + "dpp_spatial_extent": { + "type": "BoundingBox", + "coordinates": [ + [minx, miny], + [maxx, maxy], + ], + } + } + ) + logger.info( + f"Added dpp_spatial_extent to resource metadata: {bounds}" + ) + + dsu.upload_resource(new_simplified_resource, qsv_spatial_file) + + # delete the simplified spatial file + os.remove(qsv_spatial_file) + + simplification_failed_flag = False + else: + logger.warning( + f"Upload of simplified spatial file failed: {error_message}" + ) + simplification_failed_flag = True + except Exception as e: + logger.warning(f"Simplification and conversion failed: {str(e)}") + logger.warning( + f"Simplification and conversion failed. Using qsv geoconvert to convert to CSV, truncating large columns to {conf.QSV_STATS_STRING_MAX_LENGTH} characters..." 
+ ) + simplification_failed_flag = True + + # If we are not auto-simplifying or simplification failed, use qsv geoconvert + if not conf.AUTO_SPATIAL_SIMPLIFICATION or simplification_failed_flag: + logger.info("Converting spatial file to CSV using qsv geoconvert...") + + # Run qsv geoconvert + qsv_geoconvert_csv = os.path.join(temp_dir, "qsv_geoconvert.csv") + try: + qsv.geoconvert( + tmp, + resource_format, + "csv", + max_length=conf.QSV_STATS_STRING_MAX_LENGTH, + output_file=qsv_geoconvert_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"qsv geoconvert failed: {e}") + + tmp = qsv_geoconvert_csv + logger.info("Geoconverted successfully") + + else: + # --- its not a spreadsheet nor a spatial format, its a CSV/TSV/TAB file ------ + # Normalize & transcode to UTF-8 using `qsv input`. We need to normalize as + # it could be a CSV/TSV/TAB dialect with differing delimiters, quoting, etc. + # Using qsv input's --output option also auto-transcodes to UTF-8. + # Note that we only change the workfile, the resource file itself is unchanged. + + # ------------------- Normalize to CSV --------------------- + qsv_input_csv = os.path.join(temp_dir, "qsv_input.csv") + # if resource_format is CSV we don't need to normalize + if resource_format.upper() == "CSV": + logger.info(f"Normalizing/UTF-8 transcoding {resource_format}...") + else: + # if not CSV (e.g. TSV, TAB, etc.) we need to normalize to CSV + logger.info(f"Normalizing/UTF-8 transcoding {resource_format} to CSV...") + + qsv_input_utf_8_encoded_csv = os.path.join( + temp_dir, "qsv_input_utf_8_encoded.csv" + ) + + # using uchardet to determine encoding + file_encoding = subprocess.run( + ["uchardet", tmp], + check=True, + capture_output=True, + text=True, + ) + logger.info(f"Identified encoding of the file: {file_encoding.stdout}") + + # trim the encoding string + file_encoding.stdout = file_encoding.stdout.strip() + + # using iconv to re-encode in UTF-8 OR ASCII (as ASCII is a subset of UTF-8) + if file_encoding.stdout != "UTF-8" and file_encoding.stdout != "ASCII": + logger.info( + f"File is not UTF-8 encoded. Re-encoding from {file_encoding.stdout} to UTF-8" + ) + try: + cmd = subprocess.run( + [ + "iconv", + "-f", + file_encoding.stdout, + "-t", + "UTF-8", + tmp, + ], + capture_output=True, + check=True, + ) + except subprocess.CalledProcessError as e: + raise utils.JobError( + f"Job aborted as the file cannot be re-encoded to UTF-8. {e.stderr}" + ) + f = open(qsv_input_utf_8_encoded_csv, "wb") + f.write(cmd.stdout) + f.close() + logger.info("Successfully re-encoded to UTF-8") + + else: + qsv_input_utf_8_encoded_csv = tmp + try: + qsv.input(tmp, trim_headers=True, output_file=qsv_input_csv) + except utils.JobError as e: + raise utils.JobError( + f"Job aborted as the file cannot be normalized/transcoded: {e}." + ) + tmp = qsv_input_csv + logger.info("Normalized & transcoded...") + + # ------------------------------------- Validate CSV -------------------------------------- + # Run an RFC4180 check with `qsv validate` against the normalized, UTF-8 encoded CSV file. + # Even excel exported CSVs can be potentially invalid, as it allows the export of "flexible" + # CSVs - i.e. rows may have different column counts. + # If it passes validation, we can handle it with confidence downstream as a "normal" CSV. 
+ logger.info("Validating CSV...") + try: + qsv.validate(tmp) + except utils.JobError as e: + raise utils.JobError(f"qsv validate failed: {e}") + + logger.info("Well-formed, valid CSV file confirmed...") + + # --------------------- Sortcheck -------------------------- + # if SORT_AND_DUPE_CHECK is True or DEDUP is True + # check if the file is sorted and if it has duplicates + # get the record count, unsorted breaks and duplicate count as well + if conf.SORT_AND_DUPE_CHECK or conf.DEDUP: + logger.info("Checking for duplicates and if the CSV is sorted...") + + try: + qsv_sortcheck = qsv.sortcheck(tmp, json_output=True, uses_stdio=True) + except utils.JobError as e: + raise utils.JobError( + f"Failed to check if CSV is sorted and has duplicates: {e}" + ) + + try: + # Handle both subprocess.CompletedProcess and dict outputs + stdout_content = ( + qsv_sortcheck.stdout + if hasattr(qsv_sortcheck, "stdout") + else qsv_sortcheck.get("stdout") + ) + sortcheck_json = json.loads(str(stdout_content)) + except (json.JSONDecodeError, AttributeError) as e: + raise utils.JobError(f"Failed to parse sortcheck JSONoutput: {e}") + + try: + # Extract and validate required fields + is_sorted = bool(sortcheck_json.get("sorted", False)) + record_count = int(sortcheck_json.get("record_count", 0)) + unsorted_breaks = int(sortcheck_json.get("unsorted_breaks", 0)) + dupe_count = int(sortcheck_json.get("dupe_count", 0)) + dataset_stats["IS_SORTED"] = is_sorted + dataset_stats["RECORD_COUNT"] = record_count + dataset_stats["UNSORTED_BREAKS"] = unsorted_breaks + dataset_stats["DUPE_COUNT"] = dupe_count + except (ValueError, TypeError) as e: + raise utils.JobError(f"Invalid numeric value in sortcheck output: {e}") + + # Format the message with clear statistics + sortcheck_msg = f"Sorted: {is_sorted}; Unsorted breaks: {unsorted_breaks:,}" + if is_sorted and dupe_count > 0: + sortcheck_msg = f"{sortcheck_msg}; Duplicates: {dupe_count:,}" + + logger.info(sortcheck_msg) + + # --------------- Do we need to dedup? ------------------ + if conf.DEDUP and dupe_count > 0: + qsv_dedup_csv = os.path.join(temp_dir, "qsv_dedup.csv") + logger.info(f"{dupe_count} duplicate rows found. Deduping...") + + try: + qsv.extdedup(tmp, qsv_dedup_csv) + except utils.JobError as e: + raise utils.JobError(f"Check for duplicates error: {e}") + + dataset_stats["DEDUPED"] = True + tmp = qsv_dedup_csv + logger.info(f"Deduped CSV saved to {qsv_dedup_csv}") + else: + dataset_stats["DEDUPED"] = False + + # ----------------------- Headers & Safenames --------------------------- + # get existing header names, so we can use them for data dictionary labels + # should we need to change the column name to make it "db-safe" + try: + qsv_headers = qsv.headers(tmp, just_names=True) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + original_headers = str(qsv_headers.stdout).strip() + original_header_dict = { + idx: ele for idx, ele in enumerate(original_headers.splitlines()) + } + + # now, ensure our column/header names identifiers are "safe names" + # i.e. 
valid postgres/CKAN Datastore identifiers + qsv_safenames_csv = os.path.join(temp_dir, "qsv_safenames.csv") + logger.info('Checking for "database-safe" header names...') + try: + qsv_safenames = qsv.safenames( + tmp, + mode="json", + reserved=conf.RESERVED_COLNAMES, + prefix=conf.UNSAFE_PREFIX, + uses_stdio=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot scan CSV headers: {e}") + + unsafe_json = json.loads(str(qsv_safenames.stdout)) + unsafe_headers = unsafe_json["unsafe_headers"] + + if unsafe_headers: + logger.info( + f'"{len(unsafe_headers)} unsafe" header names found ({unsafe_headers}). Sanitizing..."' + ) + qsv_safenames = qsv.safenames( + tmp, mode="conditional", output_file=qsv_safenames_csv + ) + tmp = qsv_safenames_csv + else: + logger.info("No unsafe header names found...") + + # ---------------------- Type Inferencing ----------------------- + # at this stage, we have a "clean" CSV ready for Type Inferencing + + # first, index csv for speed - count, stats and slice + # are all accelerated/multithreaded when an index is present + try: + qsv_index_file = tmp + ".idx" + qsv.index(tmp) + except utils.JobError as e: + raise utils.JobError(f"Cannot index CSV: {e}") + + # if SORT_AND_DUPE_CHECK = True, we already know the record count + # so we can skip qsv count. + if not conf.SORT_AND_DUPE_CHECK: + # get record count, this is instantaneous with an index + try: + qsv_count = qsv.count(tmp) + record_count = int(str(qsv_count.stdout).strip()) + dataset_stats["RECORD_COUNT"] = record_count + except utils.JobError as e: + raise utils.JobError(f"Cannot count records in CSV: {e}") + + # its empty, nothing to do + if record_count == 0: + logger.warning("Upload skipped as there are zero records.") + return + + # log how many records we detected + unique_qualifier = "" + if conf.DEDUP: + unique_qualifier = "unique" + logger.info(f"{record_count} {unique_qualifier} records detected...") + + # run qsv stats to get data types and summary statistics + logger.info("Inferring data types and compiling statistics...") + headers = [] + types = [] + headers_min = [] + headers_max = [] + headers_cardinality = [] + qsv_stats_csv = os.path.join(temp_dir, "qsv_stats.csv") + + try: + # If the file is a spatial format, we need to use --max-length + # to truncate overly long strings from causing issues with + # Python's CSV reader and Postgres's limits with the COPY command + if spatial_format_flag: + env = os.environ.copy() + env["QSV_STATS_STRING_MAX_LENGTH"] = str(conf.QSV_STATS_STRING_MAX_LENGTH) + qsv_stats = qsv.stats( + tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + env=env, + ) + else: + qsv_stats = qsv.stats( + tmp, + infer_dates=True, + dates_whitelist=conf.QSV_DATES_WHITELIST, + stats_jsonl=True, + prefer_dmy=conf.PREFER_DMY, + cardinality=bool(conf.AUTO_INDEX_THRESHOLD), + summary_stats_options=conf.SUMMARY_STATS_OPTIONS, + output_file=qsv_stats_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot infer data types and compile statistics: {e}") + + # Dictionary to look up stats by resource field name + resource_fields_stats = {} + + with open(qsv_stats_csv, mode="r") as inp: + reader = csv.DictReader(inp) + for row in reader: + # Add to stats dictionary with resource field name as key + resource_fields_stats[row["field"]] = {"stats": row} + + fr = {k: v for k, 
v in row.items()} + schema_field = fr.get("field", "Unnamed Column") + if schema_field.startswith("qsv_"): + break + headers.append(schema_field) + types.append(fr.get("type", "String")) + headers_min.append(fr["min"]) + headers_max.append(fr["max"]) + if conf.AUTO_INDEX_THRESHOLD: + headers_cardinality.append(int(fr.get("cardinality") or 0)) + + # Get the field stats for each field in the headers list + existing = dsu.datastore_resource_exists(resource_id) + existing_info = None + if existing: + existing_info = dict( + (f["id"], f["info"]) for f in existing.get("fields", []) if "info" in f + ) + + # if this is an existing resource + # override with types user requested in Data Dictionary + if existing_info: + types = [ + { + "text": "String", + "numeric": "Float", + "timestamp": "DateTime", + }.get(existing_info.get(h, {}).get("type_override"), t) + for t, h in zip(types, headers) + ] + + # Delete existing datastore resource before proceeding. + if existing: + logger.info(f'Deleting existing resource "{resource_id}" from datastore.') + dsu.delete_datastore_resource(resource_id) + + # 1st pass of building headers_dict + # here we map inferred types to postgresql data types + default_type = "String" + temp_headers_dicts = [ + dict( + id=field[0], + type=conf.TYPE_MAPPING.get( + str(field[1]) if field[1] else default_type, "text" + ), + ) + for field in zip(headers, types) + ] + + # 2nd pass header_dicts, checking for smartint types. + # "smartint" will automatically select the best integer data type based on the + # min/max values of the column we got from qsv stats. + # We also set the Data Dictionary Label to original column names in case we made + # the names "db-safe" as the labels are used by DataTables_view to label columns + # we also take note of datetime/timestamp fields, so we can normalize them + # to RFC3339 format, which is Postgres COPY ready + datetimecols_list = [] + headers_dicts = [] + for idx, header in enumerate(temp_headers_dicts): + if header["type"] == "smartint": + if ( + int(headers_max[idx]) <= conf.POSTGRES_INT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_INT_MIN + ): + header_type = "integer" + elif ( + int(headers_max[idx]) <= conf.POSTGRES_BIGINT_MAX + and int(headers_min[idx]) >= conf.POSTGRES_BIGINT_MIN + ): + header_type = "bigint" + else: + header_type = "numeric" + else: + header_type = header["type"] + if header_type == "timestamp": + datetimecols_list.append(header["id"]) + info_dict = dict(label=original_header_dict.get(idx, "Unnamed Column")) + headers_dicts.append(dict(id=header["id"], type=header_type, info=info_dict)) + + # Maintain data dictionaries from matching column names + # if data dictionary already exists for this resource as + # we want to preserve the user's data dictionary curations + if existing_info: + for h in headers_dicts: + if h["id"] in existing_info: + h["info"] = existing_info[h["id"]] + # create columns with types user requested + type_override = existing_info[h["id"]].get("type_override") + if type_override in list(conf.TYPE_MAPPING.values()): + h["type"] = type_override + + logger.info(f"Determined headers and types: {headers_dicts}...") + + # ----------------------- Frequency Table --------------------------- + # compile a frequency table for each column + qsv_freq_csv = os.path.join(temp_dir, "qsv_freq.csv") + + try: + qsv.frequency(tmp, limit=conf.QSV_FREQ_LIMIT, output_file=qsv_freq_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a frequency table: {e}") + + resource_fields_freqs = {} + try: 
+ with open(qsv_freq_csv, "r") as f: + reader = csv.DictReader(f) + for row in reader: + field = row["field"] + value = row["value"] + count = row["count"] + percentage = row["percentage"] + + # Initialize list for field if it doesn't exist + if field not in resource_fields_freqs: + resource_fields_freqs[field] = [] + + # Append the frequency data as a dict to the field's list + resource_fields_freqs[field].append( + { + "value": value, + "count": count, + "percentage": percentage, + } + ) + + logger.trace(f"Resource fields freqs: {resource_fields_freqs}") + + except IOError as e: + raise utils.JobError("Could not open frequency CSV file: {}".format(e)) + + # ------------------- Do we need to create a Preview? ----------------------- + # if conf.PREVIEW_ROWS is not zero, create a preview using qsv slice + # we do the rows_to_copy > conf.PREVIEW_ROWS to check if we don't need to slice + # the CSV anymore if we only did a partial download of N conf.PREVIEW_ROWS already + rows_to_copy = record_count + if conf.PREVIEW_ROWS and record_count > conf.PREVIEW_ROWS: + if conf.PREVIEW_ROWS > 0: + # conf.PREVIEW_ROWS is positive, slice from the beginning + logger.info(f"Preparing {conf.PREVIEW_ROWS}-row preview...") + qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") + try: + qsv.slice(tmp, length=conf.PREVIEW_ROWS, output_file=qsv_slice_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice: {e}") + rows_to_copy = conf.PREVIEW_ROWS + tmp = qsv_slice_csv + else: + # conf.PREVIEW_ROWS is negative, slice from the end + # TODO: do http range request so we don't have to download the whole file + # to slice from the end + slice_len = abs(conf.PREVIEW_ROWS) + logger.info(f"Preparing {slice_len}-row preview from the end...") + qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv") + try: + qsv.slice(tmp, start=-1, length=slice_len, output_file=qsv_slice_csv) + except utils.JobError as e: + raise utils.JobError(f"Cannot create a preview slice from the end: {e}") + rows_to_copy = slice_len + tmp = qsv_slice_csv + + dataset_stats["PREVIEW_FILE_SIZE"] = os.path.getsize(tmp) + dataset_stats["PREVIEW_RECORD_COUNT"] = rows_to_copy + + # ---------------- Normalize dates to RFC3339 format -------------------- + # if there are any datetime fields, normalize them to RFC3339 format + # so we can readily insert them as timestamps into postgresql with COPY + if datetimecols_list: + qsv_applydp_csv = os.path.join(temp_dir, "qsv_applydp.csv") + datecols = ",".join(datetimecols_list) + + logger.info( + f'Formatting dates "{datecols}" to ISO 8601/RFC 3339 format with PREFER_DMY: {conf.PREFER_DMY}...' + ) + try: + qsv.datefmt( + datecols, + tmp, + prefer_dmy=conf.PREFER_DMY, + output_file=qsv_applydp_csv, + ) + except utils.JobError as e: + raise utils.JobError(f"Applydp error: {e}") + tmp = qsv_applydp_csv + + # -------------------- QSV ANALYSIS DONE -------------------- + analysis_elapsed = time.perf_counter() - analysis_start + logger.info( + f"ANALYSIS DONE! Analyzed and prepped in {analysis_elapsed:,.2f} seconds." 
+ ) + + # ----------------------------- PII Screening ------------------------------ + # we scan for Personally Identifiable Information (PII) using qsv's powerful + # searchset command which can SIMULTANEOUSLY compare several regexes per + # field in one pass + piiscreening_start = 0 + piiscreening_elapsed = 0 + pii_found = False + + if conf.PII_SCREENING: + piiscreening_start = time.perf_counter() + pii_found = screen_for_pii(tmp, resource, qsv, temp_dir, logger) + piiscreening_elapsed = time.perf_counter() - piiscreening_start + + dataset_stats["PII_SCREENING"] = conf.PII_SCREENING + dataset_stats["PII_FOUND"] = pii_found + + # delete the qsv index file manually + # as it was created by qsv index, and not by tempfile + os.remove(qsv_index_file) + + # at this stage, the resource is ready for COPYing to the Datastore + + if dry_run: + logger.warning("Dry run only. Returning without copying to the Datastore...") + return headers_dicts + + # ============================================================ + # COPY to Datastore + # ============================================================ + copy_start = time.perf_counter() + + if conf.PREVIEW_ROWS: + logger.info(f"COPYING {rows_to_copy}-row preview to Datastore...") + else: + logger.info(f"COPYING {rows_to_copy} rows to Datastore...") + + # first, let's create an empty datastore table w/ guessed types + dsu.send_resource_to_datastore( + resource=None, + resource_id=resource["id"], + headers=headers_dicts, + records=None, + aliases=None, + calculate_record_count=False, + ) + + copied_count = 0 + try: + raw_connection = psycopg2.connect(conf.DATASTORE_WRITE_URL) + except psycopg2.Error as e: + raise utils.JobError(f"Could not connect to the Datastore: {e}") + else: + cur = raw_connection.cursor() + + # truncate table to use copy freeze option and further increase + # performance as there is no need for WAL logs to be maintained + # https://www.postgresql.org/docs/current/populate.html#POPULATE-COPY-FROM + try: + cur.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(resource_id)) + ) + + except psycopg2.Error as e: + logger.warning(f"Could not TRUNCATE: {e}") + + col_names_list = [h["id"] for h in headers_dicts] + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, FREEZE 1, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(resource_id), + column_names, + ) + # specify a 1MB buffer size for COPY read from disk + with open(tmp, "rb", conf.COPY_READBUFFER_SIZE) as f: + try: + cur.copy_expert(copy_sql, f, size=conf.COPY_READBUFFER_SIZE) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + else: + copied_count = cur.rowcount + + raw_connection.commit() + # this is needed to issue a VACUUM ANALYZE + raw_connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = raw_connection.cursor() + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + analyze_cur.close() + + copy_elapsed = time.perf_counter() - copy_start + logger.info( + f'...copying done. Copied {copied_count} rows to "{resource_id}" in {copy_elapsed:,.2f} seconds.' 
+ ) + + # ================================================================================================= + # INDEXING + # ================================================================================================= + # if AUTO_INDEX_THRESHOLD > 0 or AUTO_INDEX_DATES is true + # create indices automatically based on summary statistics + # For columns w/ cardinality = record_count, it's all unique values, create a unique index + # If AUTO_INDEX_DATES is true, index all date columns + # if a column's cardinality <= AUTO_INDEX_THRESHOLD, create an index for that column + if ( + conf.AUTO_INDEX_THRESHOLD + or (conf.AUTO_INDEX_DATES and datetimecols_list) + or conf.AUTO_UNIQUE_INDEX + ): + index_start = time.perf_counter() + logger.info( + f"AUTO-INDEXING. Auto-index threshold: {conf.AUTO_INDEX_THRESHOLD} unique value/s. Auto-unique index: {conf.AUTO_UNIQUE_INDEX} Auto-index dates: {conf.AUTO_INDEX_DATES} ..." + ) + index_cur = raw_connection.cursor() + + # if auto_index_threshold == -1 + # we index all the columns + if conf.AUTO_INDEX_THRESHOLD == -1: + conf.AUTO_INDEX_THRESHOLD = record_count + + index_count = 0 + for idx, cardinality in enumerate(headers_cardinality): + curr_col = headers[idx] + if ( + conf.AUTO_INDEX_THRESHOLD > 0 + or conf.AUTO_INDEX_DATES + or conf.AUTO_UNIQUE_INDEX + ): + if cardinality == record_count and conf.AUTO_UNIQUE_INDEX: + # all the values are unique for this column, create a unique index + if conf.PREVIEW_ROWS > 0: + unique_value_count = min(conf.PREVIEW_ROWS, cardinality) + else: + unique_value_count = cardinality + logger.info( + f'Creating UNIQUE index on "{curr_col}" for {unique_value_count} unique values...' + ) + try: + index_cur.execute( + sql.SQL("CREATE UNIQUE INDEX ON {} ({})").format( + sql.Identifier(resource_id), + sql.Identifier(curr_col), + ) + ) + except psycopg2.Error as e: + logger.warning( + f'Could not CREATE UNIQUE INDEX on "{curr_col}": {e}' + ) + index_count += 1 + elif cardinality <= conf.AUTO_INDEX_THRESHOLD or ( + conf.AUTO_INDEX_DATES and (curr_col in datetimecols_list) + ): + # cardinality <= auto_index_threshold or its a date and auto_index_date is true + # create an index + if curr_col in datetimecols_list: + logger.info( + f'Creating index on "{curr_col}" date column for {cardinality} unique value/s...' + ) + else: + logger.info( + f'Creating index on "{curr_col}" for {cardinality} unique value/s...' + ) + try: + index_cur.execute( + sql.SQL("CREATE INDEX ON {} ({})").format( + sql.Identifier(resource_id), + sql.Identifier(curr_col), + ) + ) + except psycopg2.Error as e: + logger.warning(f'Could not CREATE INDEX on "{curr_col}": {e}') + index_count += 1 + + index_cur.close() + raw_connection.commit() + + logger.info("Vacuum Analyzing table to optimize indices...") + + # this is needed to issue a VACUUM ANALYZE + raw_connection.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT + ) + analyze_cur = raw_connection.cursor() + analyze_cur.execute( + sql.SQL("VACUUM ANALYZE {}").format(sql.Identifier(resource_id)) + ) + analyze_cur.close() + + index_elapsed = time.perf_counter() - index_start + logger.info( + f'...indexing/vacuum analysis done. Indexed {index_count} column/s in "{resource_id}" in {index_elapsed:,.2f} seconds.' + ) + + # ============================================================ + # PROCESS DRUF JINJA2 FORMULAE + # ============================================================ + # Check if there are any fields with DRUF keys in the scheming_yaml + # There are two types of DRUF keys: + # 1. 
"formula": This is used to update the field value DIRECTLY + # when the resource is created/updated. It can update both package and resource fields. + # 2. "suggestion_formula": This is used to populate the suggestion + # popovers DURING data entry/curation. + # DRUF keys are stored as jinja2 template expressions in the scheming_yaml + # and are rendered using the Jinja2 template engine. + formulae_start = time.perf_counter() + + # Fetch the scheming_yaml and package + package_id = resource["package_id"] + scheming_yaml, package = dsu.get_scheming_yaml( + package_id, scheming_yaml_type="dataset" + ) + + # Check for suggestion_formula in dataset_fields + has_suggestion_formula = any( + isinstance(field, dict) + and any(key.startswith("suggestion_formula") for key in field.keys()) + for field in scheming_yaml["dataset_fields"] + ) + + if has_suggestion_formula: + + logger.info( + 'Found suggestion formulae in schema' + ) + + # Check for "dpp_suggestions" in scheming_yaml + schema_has_dpp_suggestions = any( + isinstance(field, dict) + and field.get("field_name") == "dpp_suggestions" + for field in scheming_yaml["dataset_fields"] + ) + if not schema_has_dpp_suggestions: + logger.error( + '"dpp_suggestions" field required but not found in your schema. Ensure that your scheming.yaml file contains the "dpp_suggestions" field as a json_object.' + ) + return + else: + logger.info( + 'Found "dpp_suggestions" field in schema' + ) + + # add "dpp_suggestions" to package if it does not exist + if "dpp_suggestions" not in package: + + logger.warning( + 'Warning: "dpp_suggestions" field required to process Suggestion Formulae is not found in this package. Adding "dpp_suggestions" to package' + ) + + try: + package["dpp_suggestions"] = {} + dsu.patch_package(package) + logger.warning( + '"dpp_suggestions" field added to package' + ) + + except Exception as e: + logger.error( + f'Error adding "dpp_suggestions" field {e}' + ) + return + else: + logger.info( + 'No suggestion formulae found' + ) + + logger.trace(f"package: {package}") + + # FIRST, INITIALIZE THE FORMULA PROCESSOR + formula_processor = j2h.FormulaProcessor( + scheming_yaml, + package, + resource, + resource_fields_stats, + resource_fields_freqs, + dataset_stats, + logger, + ) + + package.setdefault("dpp_suggestions", {})[ + "STATUS" + ] = "STARTING FORMULAE PROCESSING..." + dsu.patch_package(package) + + # Clear all lru_cache before processing formulae + dsu.datastore_search.cache_clear() + dsu.datastore_search_sql.cache_clear() + dsu.datastore_info.cache_clear() + dsu.index_exists.cache_clear() + + # SECOND, WE PROCESS THE FORMULAE THAT UPDATE THE + # PACKAGE AND RESOURCE FIELDS DIRECTLY + # using the package_patch CKAN API so we only update the fields + # with formulae + package_updates = formula_processor.process_formulae( + "package", "dataset_fields", "formula" + ) + if package_updates: + # Update package with formula results + package.update(package_updates) + status_msg = "PACKAGE formulae processed..." 
+ package["dpp_suggestions"]["STATUS"] = status_msg + try: + patched_package = dsu.patch_package(package) + logger.debug(f"Package after patching: {patched_package}") + package = patched_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error patching package: {str(e)}") + + # Process resource formulae + # as this is a direct update, we update the resource dictionary directly + resource_updates = formula_processor.process_formulae( + "resource", "resource_fields", "formula" + ) + if resource_updates: + # Update resource with formula results + resource.update(resource_updates) + status_msg = "RESOURCE formulae processed..." + if resource.get("dpp_suggestions"): + resource["dpp_suggestions"]["STATUS"] = status_msg + else: + resource["dpp_suggestions"] = {"STATUS": status_msg} + logger.info(status_msg) + + # THIRD, WE PROCESS THE SUGGESTIONS THAT SHOW UP IN THE SUGGESTION POPOVER + # we update the package dpp_suggestions field + # from which the Suggestion popover UI will pick it up + package_suggestions = formula_processor.process_formulae( + "package", "dataset_fields", "suggestion_formula" + ) + if package_suggestions: + logger.trace(f"package_suggestions: {package_suggestions}") + revise_update_content = {"package": package_suggestions} + try: + status_msg = "PACKAGE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + logger.trace(f"Package after revising: {revised_package}") + package = revised_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error revising package: {str(e)}") + + # Process resource suggestion formulae + # Note how we still update the PACKAGE dpp_suggestions field + # and there is NO RESOURCE dpp_suggestions field. + # This is because suggestion formulae are used to populate the + # suggestion popover DURING data entry/curation and suggestion formulae + # may update both package and resource fields. + resource_suggestions = formula_processor.process_formulae( + "resource", "resource_fields", "suggestion_formula" + ) + if resource_suggestions: + logger.trace(f"resource_suggestions: {resource_suggestions}") + resource_name = resource["name"] + revise_update_content = {"resource": {resource_name: resource_suggestions}} + + # Handle existing suggestions + if package.get("dpp_suggestions"): + package["dpp_suggestions"].update(revise_update_content["resource"]) + else: + package["dpp_suggestions"] = revise_update_content["resource"] + + try: + status_msg = "RESOURCE suggestion formulae processed..." + revise_update_content["STATUS"] = status_msg + + revised_package = dsu.revise_package( + package_id, update={"dpp_suggestions": revise_update_content} + ) + logger.trace(f"Package after revising: {revised_package}") + package = revised_package + logger.info(status_msg) + except Exception as e: + logger.error(f"Error revising package: {str(e)}") + + # -------------------- FORMULAE PROCESSING DONE -------------------- + formulae_elapsed = time.perf_counter() - formulae_start + logger.info( + f"FORMULAE PROCESSING DONE! Processed in {formulae_elapsed:,.2f} seconds." 
+ ) + + # ============================================================ + # UPDATE RESOURCE METADATA + # ============================================================ + metadata_start = time.perf_counter() + logger.info("UPDATING RESOURCE METADATA...") + + # --------------------- AUTO-ALIASING ------------------------ + # aliases are human-readable, and make it easier to use than resource id hash + # when using the Datastore API and in SQL queries + alias = None + if conf.AUTO_ALIAS: + logger.info(f"AUTO-ALIASING. Auto-alias-unique: {conf.AUTO_ALIAS_UNIQUE} ...") + # get package info, so we can construct the alias + package = dsu.get_package(resource["package_id"]) + + resource_name = resource.get("name") + package_name = package.get("name") + owner_org = package.get("organization") + owner_org_name = "" + if owner_org: + owner_org_name = owner_org.get("name") + if resource_name and package_name and owner_org_name: + # we limit it to 55, so we still have space for sequence & stats suffix + # postgres max identifier length is 63 + alias = f"{resource_name}-{package_name}-{owner_org_name}"[:55] + # if AUTO_ALIAS_UNIQUE is true, check if the alias already exist, if it does + # add a sequence suffix so the new alias can be created + cur.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of", + (alias + "%",), + ) + alias_query_result = cur.fetchone() + if alias_query_result: + alias_count = alias_query_result[0] + existing_alias_of = alias_query_result[1] + else: + alias_count = 0 + existing_alias_of = "" + if conf.AUTO_ALIAS_UNIQUE and alias_count > 1: + alias_sequence = alias_count + 1 + while True: + # we do this, so we're certain the new alias does not exist + # just in case they deleted an older alias with a lower sequence # + alias = f"{alias}-{alias_sequence:03}" + cur.execute( + "SELECT COUNT(*), alias_of FROM _table_metadata where name like %s group by alias_of;", + (alias + "%",), + ) + alias_exists = cur.fetchone()[0] + if not alias_exists: + break + alias_sequence += 1 + elif alias_count == 1: + logger.warning( + f'Dropping existing alias "{alias}" for resource "{existing_alias_of}"...' + ) + try: + cur.execute( + sql.SQL("DROP VIEW IF EXISTS {}").format(sql.Identifier(alias)) + ) + except psycopg2.Error as e: + logger.warning(f"Could not drop alias/view: {e}") + + else: + logger.warning( + f"Cannot create alias: {resource_name}-{package_name}-{owner_org}" + ) + alias = None + + # -------- should we ADD_SUMMARY_STATS_RESOURCE? ------------- + # by default, we only add summary stats if we're not doing a partial download + # (otherwise, you're summarizing the preview, not the whole file) + # That is, unless SUMMARY_STATS_WITH_PREVIEW is set to true + if conf.ADD_SUMMARY_STATS_RESOURCE or conf.SUMMARY_STATS_WITH_PREVIEW: + stats_resource_id = resource_id + "-stats" + + # check if the stats already exist + existing_stats = dsu.datastore_resource_exists(stats_resource_id) + # Delete existing summary-stats before proceeding. 
+ if existing_stats: + logger.info(f'Deleting existing summary stats "{stats_resource_id}".') + + cur.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (stats_resource_id + "%",), + ) + stats_alias_result = cur.fetchone() + if stats_alias_result: + existing_stats_alias_of = stats_alias_result[0] + + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + stats_aliases = [stats_resource_id] + if conf.AUTO_ALIAS: + auto_alias_stats_id = alias + "-stats" + stats_aliases.append(auto_alias_stats_id) + + # check if the summary-stats alias already exist. We need to do this as summary-stats resources + # may end up having the same alias if AUTO_ALIAS_UNIQUE is False, so we need to drop the + # existing summary stats-alias. + existing_alias_stats = dsu.datastore_resource_exists(auto_alias_stats_id) + # Delete existing auto-aliased summary-stats before proceeding. + if existing_alias_stats: + logger.info( + f'Deleting existing alias summary stats "{auto_alias_stats_id}".' + ) + + cur.execute( + "SELECT alias_of FROM _table_metadata where name like %s group by alias_of;", + (auto_alias_stats_id + "%",), + ) + result = cur.fetchone() + if result: + existing_stats_alias_of = result[0] + + dsu.delete_datastore_resource(existing_stats_alias_of) + dsu.delete_resource(existing_stats_alias_of) + + # run stats on stats CSV to get header names and infer data types + # we don't need summary statistics, so use the --typesonly option + try: + qsv_stats_stats = qsv.stats( + qsv_stats_csv, + typesonly=True, + ) + except utils.JobError as e: + raise utils.JobError(f"Cannot run stats on CSV stats: {e}") + + stats_stats = str(qsv_stats_stats.stdout).strip() + stats_stats_dict = [ + dict(id=ele.split(",")[0], type=conf.TYPE_MAPPING[ele.split(",")[1]]) + for idx, ele in enumerate(stats_stats.splitlines()[1:], 1) + ] + + logger.info(f"stats_stats_dict: {stats_stats_dict}") + + resource_name = resource.get("name") + stats_resource = { + "package_id": resource["package_id"], + "name": resource_name + " - Summary Statistics", + "format": "CSV", + "mimetype": "text/csv", + } + stats_response = dsu.send_resource_to_datastore( + stats_resource, + resource_id=None, + headers=stats_stats_dict, + records=None, + aliases=stats_aliases, + calculate_record_count=False, + ) + + logger.info(f"stats_response: {stats_response}") + + new_stats_resource_id = stats_response["result"]["resource_id"] + + # now COPY the stats to the datastore + col_names_list = [h["id"] for h in stats_stats_dict] + logger.info( + f'ADDING SUMMARY STATISTICS {col_names_list} in "{new_stats_resource_id}" with alias/es "{stats_aliases}"...' 
+ ) + + column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list) + + copy_sql = sql.SQL( + "COPY {} ({}) FROM STDIN " + "WITH (FORMAT CSV, " + "HEADER 1, ENCODING 'UTF8');" + ).format( + sql.Identifier(new_stats_resource_id), + column_names, + ) + + with open(qsv_stats_csv, "rb") as f: + try: + cur.copy_expert(copy_sql, f) + except psycopg2.Error as e: + raise utils.JobError(f"Postgres COPY failed: {e}") + + stats_resource["id"] = new_stats_resource_id + stats_resource["summary_statistics"] = True + stats_resource["summary_of_resource"] = resource_id + dsu.update_resource(stats_resource) + + cur.close() + raw_connection.commit() + raw_connection.close() + + resource["datastore_active"] = True + resource["total_record_count"] = record_count + if conf.PREVIEW_ROWS < record_count or (conf.PREVIEW_ROWS > 0): + resource["preview"] = True + resource["preview_rows"] = copied_count + else: + resource["preview"] = False + resource["preview_rows"] = None + resource["partial_download"] = False + dsu.update_resource(resource) + + # tell CKAN to calculate_record_count and set alias if set + dsu.send_resource_to_datastore( + resource=None, + resource_id=resource["id"], + headers=headers_dicts, + records=None, + aliases=alias, + calculate_record_count=True, + ) + + if alias: + logger.info(f'Created alias "{alias}" for "{resource_id}"...') + + metadata_elapsed = time.perf_counter() - metadata_start + logger.info( + f"RESOURCE METADATA UPDATES DONE! Resource metadata updated in {metadata_elapsed:,.2f} seconds." + ) + + # -------------------- DONE -------------------- + package.setdefault("dpp_suggestions", {})["STATUS"] = "DONE" + dsu.patch_package(package) + + total_elapsed = time.perf_counter() - timer_start + newline_var = "\n" + end_msg = f""" + DATAPUSHER+ JOB DONE! +   Download: {fetch_elapsed:,.2f} +   Analysis: {analysis_elapsed:,.2f}{(newline_var + f" PII Screening: {piiscreening_elapsed:,.2f}") if piiscreening_elapsed > 0 else ""} +   COPYing: {copy_elapsed:,.2f} +   Indexing: {index_elapsed:,.2f} +   Formulae processing: {formulae_elapsed:,.2f} +   Resource metadata updates: {metadata_elapsed:,.2f} + TOTAL ELAPSED TIME: {total_elapsed:,.2f} + """ + logger.info(end_msg)
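# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the "smartint" widening rule
# applied while building headers_dicts. qsv stats reports each column's
# min/max, and the job picks the narrowest Postgres integer type that can hold
# that range, falling back to NUMERIC. The bound constants below stand in for
# conf.POSTGRES_INT_MAX / conf.POSTGRES_BIGINT_MAX and friends; they are
# assumptions here, not values read from the extension's config.

POSTGRES_INT_MIN, POSTGRES_INT_MAX = -2_147_483_648, 2_147_483_647
POSTGRES_BIGINT_MIN, POSTGRES_BIGINT_MAX = (
    -9_223_372_036_854_775_808,
    9_223_372_036_854_775_807,
)


def smartint_pg_type(col_min: int, col_max: int) -> str:
    """Map an observed integer range to a Postgres column type."""
    if POSTGRES_INT_MIN <= col_min and col_max <= POSTGRES_INT_MAX:
        return "integer"
    if POSTGRES_BIGINT_MIN <= col_min and col_max <= POSTGRES_BIGINT_MAX:
        return "bigint"
    return "numeric"


assert smartint_pg_type(0, 1_000) == "integer"
assert smartint_pg_type(0, 3_000_000_000) == "bigint"
assert smartint_pg_type(0, 10**20) == "numeric"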
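# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): what the date-normalization
# step accomplishes. Columns flagged as datetime are rewritten to ISO 8601 /
# RFC 3339 so Postgres can ingest them directly as timestamps during COPY. The
# job delegates this to qsv's datefmt command; the plain-Python equivalent
# below with dateutil is only meant to show the transformation.

from dateutil.parser import parse as parsedate

prefer_dmy = False  # mirrors the PREFER_DMY setting
for raw in ["03/04/2021", "2021-3-4 5:06 PM", "4 Mar 2021"]:
    print(f"{raw!r} -> {parsedate(raw, dayfirst=prefer_dmy).isoformat()}")
# '03/04/2021'       -> 2021-03-04T00:00:00  (read as MDY because prefer_dmy is False)
# '2021-3-4 5:06 PM' -> 2021-03-04T17:06:00
# '4 Mar 2021'       -> 2021-03-04T00:00:00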
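# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the TRUNCATE + COPY ... FREEZE
# pattern used to load the prepared CSV into the Datastore table. FREEZE is
# only legal when the table was created or truncated in the same transaction,
# which is why the TRUNCATE comes first. The DSN, table name, and helper name
# below are placeholders.

from typing import List

import psycopg2
from psycopg2 import sql


def copy_csv_to_table(dsn: str, table: str, columns: List[str], csv_path: str) -> int:
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            # TRUNCATE in the same transaction so the FREEZE option is allowed
            cur.execute(sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(table)))
            copy_stmt = sql.SQL(
                "COPY {} ({}) FROM STDIN "
                "WITH (FORMAT CSV, FREEZE 1, HEADER 1, ENCODING 'UTF8')"
            ).format(
                sql.Identifier(table),
                sql.SQL(",").join(sql.Identifier(c) for c in columns),
            )
            with open(csv_path, "rb") as f:
                cur.copy_expert(copy_stmt, f)
            copied = cur.rowcount  # rows written by COPY
        conn.commit()
        return copied
    finally:
        conn.close()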
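# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the per-column auto-indexing
# decision, boiled down. A unique index when every value is distinct and
# AUTO_UNIQUE_INDEX is on; a plain index when the cardinality is at or below
# AUTO_INDEX_THRESHOLD, or the column is a date and AUTO_INDEX_DATES is on;
# otherwise nothing. (The -1 "index every column" special case is omitted.)


def index_action(cardinality, record_count, is_date_col,
                 threshold, index_dates, unique_index):
    """Return 'unique', 'index', or None for a single column."""
    if unique_index and cardinality == record_count:
        return "unique"
    if cardinality <= threshold or (index_dates and is_date_col):
        return "index"
    return None


# e.g. a low-cardinality "region" column gets a plain index, a surrogate key
# a unique one, and a high-cardinality free-text column is left unindexed.
assert index_action(12, 10_000, False, 1_000, True, True) == "index"
assert index_action(10_000, 10_000, False, 1_000, True, True) == "unique"
assert index_action(9_500, 10_000, False, 1_000, False, True) is None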
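# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): how the suggestion-formula
# check over the scheming YAML behaves. The dataset fields below are made up,
# and the template strings are placeholders rather than real DP+ formulae;
# only the presence of a key starting with "suggestion_formula" matters.

dataset_fields = [
    {"field_name": "title", "label": "Title"},
    {"field_name": "record_count", "formula": "{{ placeholder_expression }}"},
    {"field_name": "notes", "suggestion_formula": "{{ placeholder_expression }}"},
]

has_suggestion_formula = any(
    isinstance(field, dict)
    and any(key.startswith("suggestion_formula") for key in field)
    for field in dataset_fields
)

print(has_suggestion_formula)  # True: the "notes" field carries a suggestion_formula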
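# ---------------------------------------------------------------------------
# Illustrative sketch (not from the patch above): the auto-alias naming scheme.
# Resource, package, and organization names are joined and truncated to 55
# characters so a "-NNN" de-duplication suffix and a "-stats" suffix still fit
# within Postgres's 63-character identifier limit. The helper is hypothetical;
# the job builds the string inline.

from typing import Optional


def build_alias(resource_name: str, package_name: str, org_name: str,
                sequence: Optional[int] = None) -> str:
    alias = f"{resource_name}-{package_name}-{org_name}"[:55]
    if sequence is not None:
        alias = f"{alias}-{sequence:03}"
    return alias


print(build_alias("census-2020", "population", "stats-agency"))             # base alias
print(build_alias("census-2020", "population", "stats-agency", 2))          # after a name clash
print(build_alias("census-2020", "population", "stats-agency") + "-stats")  # stats companion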