From 227169e74d8f6a84a1f26e17ecaf792bdf846bfb Mon Sep 17 00:00:00 2001 From: avdata99 Date: Wed, 29 Oct 2025 11:20:12 -0300 Subject: [PATCH 1/3] fix freq table --- ckanext/datapusher_plus/jobs.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/ckanext/datapusher_plus/jobs.py b/ckanext/datapusher_plus/jobs.py index 6e3f7831..e49f8719 100644 --- a/ckanext/datapusher_plus/jobs.py +++ b/ckanext/datapusher_plus/jobs.py @@ -1174,6 +1174,22 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): if conf.AUTO_INDEX_THRESHOLD: headers_cardinality.append(int(fr.get("cardinality") or 0)) + # Go through the qsv_stats_csv file and ensure the "mean" field is empty for + # field of type "Date" + new_qsv_stats_csv = os.path.join(temp_dir, "qsv_stats_cleaned.csv") + with open(qsv_stats_csv, mode="r") as inp, open(new_qsv_stats_csv, mode="w") as outp: + reader = csv.DictReader(inp) + fieldnames = reader.fieldnames + writer = csv.DictWriter(outp, fieldnames=fieldnames) + writer.writeheader() + for row in reader: + if row["type"] == "Date" or row["type"] == "DateTime": + row["mean"] = 0 + writer.writerow(row) + qsv_stats_csv = new_qsv_stats_csv + logger.info(f"New qsv_stats_csv types for {qsv_stats_csv}") + print(open(qsv_stats_csv).read()) + # Get the field stats for each field in the headers list existing = datastore_resource_exists(resource_id) existing_info = None @@ -1342,11 +1358,9 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): qsv_freq_cmd = [ conf.QSV_BIN, "frequency", - "--limit", - "0", + "--limit", "0", tmp, - "--output", - qsv_freq_csv, + "--output", qsv_freq_csv, ] try: qsv_freq = subprocess.run(qsv_freq_cmd, check=True) @@ -1366,10 +1380,15 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): value TEXT, count INTEGER, percentage FLOAT, + extra TEXT, PRIMARY KEY (field, value, count) ) """ ).format(freq_table, freq_table) + # Could not copy frequency data to database: + # extra data after last expected column + # CONTEXT: COPY 737a3aad-c1c8-4507-8411-2dc6ca176f84-druf-freq, + # line 2: "country,Afghanistan,6,2.52101,1" ) # Copy frequency CSV to /tmp directory for debugging purposes From bd1d1ed3f1a221371f17067a40fade3841c590ca Mon Sep 17 00:00:00 2001 From: avdata99 Date: Wed, 29 Oct 2025 11:21:25 -0300 Subject: [PATCH 2/3] undo --- ckanext/datapusher_plus/jobs.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/ckanext/datapusher_plus/jobs.py b/ckanext/datapusher_plus/jobs.py index e49f8719..9bd9f57c 100644 --- a/ckanext/datapusher_plus/jobs.py +++ b/ckanext/datapusher_plus/jobs.py @@ -1174,22 +1174,6 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): if conf.AUTO_INDEX_THRESHOLD: headers_cardinality.append(int(fr.get("cardinality") or 0)) - # Go through the qsv_stats_csv file and ensure the "mean" field is empty for - # field of type "Date" - new_qsv_stats_csv = os.path.join(temp_dir, "qsv_stats_cleaned.csv") - with open(qsv_stats_csv, mode="r") as inp, open(new_qsv_stats_csv, mode="w") as outp: - reader = csv.DictReader(inp) - fieldnames = reader.fieldnames - writer = csv.DictWriter(outp, fieldnames=fieldnames) - writer.writeheader() - for row in reader: - if row["type"] == "Date" or row["type"] == "DateTime": - row["mean"] = 0 - writer.writerow(row) - qsv_stats_csv = new_qsv_stats_csv - logger.info(f"New qsv_stats_csv types for {qsv_stats_csv}") - print(open(qsv_stats_csv).read()) - # Get the field stats for each field in the headers list existing = datastore_resource_exists(resource_id) existing_info = None @@ -1358,9 +1342,11 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): qsv_freq_cmd = [ conf.QSV_BIN, "frequency", - "--limit", "0", + "--limit", + "0", tmp, - "--output", qsv_freq_csv, + "--output", + qsv_freq_csv, ] try: qsv_freq = subprocess.run(qsv_freq_cmd, check=True) From 60d9d7306f22bcba307c9f7a28bb350d7e228553 Mon Sep 17 00:00:00 2001 From: Andres Vazquez Date: Thu, 6 Nov 2025 13:21:53 -0300 Subject: [PATCH 3/3] Replace 'extra' column with 'rank' in frequency table --- ckanext/datapusher_plus/jobs.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ckanext/datapusher_plus/jobs.py b/ckanext/datapusher_plus/jobs.py index 9bd9f57c..f1e7767e 100644 --- a/ckanext/datapusher_plus/jobs.py +++ b/ckanext/datapusher_plus/jobs.py @@ -1366,15 +1366,11 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None): value TEXT, count INTEGER, percentage FLOAT, - extra TEXT, + rank FLOAT, PRIMARY KEY (field, value, count) ) """ ).format(freq_table, freq_table) - # Could not copy frequency data to database: - # extra data after last expected column - # CONTEXT: COPY 737a3aad-c1c8-4507-8411-2dc6ca176f84-druf-freq, - # line 2: "country,Afghanistan,6,2.52101,1" ) # Copy frequency CSV to /tmp directory for debugging purposes