 # -*- coding: utf-8 -*-

+'''
+USAGE: paster --plugin=ckanext-querytool update_camstat --config=/path/to/production.ini
+
+`paster` is a command provided by [The Pylons Project](https://docs.pylonsproject.org/en/latest/),
+a collection of tools and utilities for Python that is used extensively in CKAN. It lets you run
+scripts from various locations through a single, unified utility.
+
+For example, CKAN core and any extension installed alongside it can ship scripts for miscellaneous
+purposes. `paster` makes it easy to run any of these scripts. You only need to provide the plugin
+name (`ckan` for CKAN core scripts or `ckanext-EXTENSION_NAME` for extensions), the command, and
+the path to your `.ini` CKAN config file.
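+
+For example, to run a CKAN core command and then this extension's command (the config path here
+is just a typical CKAN install location):
+
+    paster --plugin=ckan db init --config=/etc/ckan/default/production.ini
+    paster --plugin=ckanext-querytool update_camstat --config=/etc/ckan/default/production.ini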
+
+**Note**: On the first run, the script will create the organization (if it doesn't exist) and the
+datasets, as well as upload all data into resources. On every run after that, it first checks
+whether the source data has changed. If nothing has changed, the script does nothing; if any of
+the source data has changed, only the affected datasets are updated.
+'''
+
 from __future__ import print_function
 import requests
 import csv
@@ -32,6 +50,8 @@ class UpdateCamstat(CkanCommand):
     cleans it, and creates datasets and resources in CKAN if they don't exist.
     If they do exist, it compares the new checksum with a stored checksum and
     updates them if there are changes.
+
+    This class contains the main command (`command`) and the table setup
+    (`setup_tables`) methods.
     '''

     summary = __doc__.split('\n')[0]
@@ -40,6 +60,19 @@ class UpdateCamstat(CkanCommand):
     min_args = 0

     def command(self):
+        '''
+        Registers the command in the CKAN `paster` ecosystem. This is how CKAN
+        knows what to do when you run:
+
+            paster --plugin=ckanext-querytool update_camstat --config=/path/to/production.ini
+
+        `command` does two things:
+        - If the hash table doesn't exist, it creates it. Hashes are generated
+          and stored so the script can tell whether the data needs to be
+          updated due to changes.
+        - It calls `update_camstat` (which calls all other functions as needed).
+        '''
         self._load_config()
         self.owner_org = 'camstat'
         self.languages = ['en', 'km']
@@ -66,6 +99,16 @@ def command(self):
         update_camstat(self.owner_org, self.languages)

     def setup_tables(self):
+        '''
+        Creates a table in the DB, `camstat_hashes`, to store a hash of the
+        data for each dataflow.
+
+        When the script is run again later:
+        - New hashes are generated and compared to the stored hashes before
+          updating each dataset.
+        - If the hash has changed for a given dataset, that dataset is updated
+          with the latest data and the new hash is saved to the DB.
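+
+        A minimal sketch of the idea behind the table (the column names here
+        are illustrative assumptions, not necessarily the real schema; see the
+        actual `Table` definition in the function body):
+
+        ```
+        # assumes: from sqlalchemy import Table, Column, UnicodeText
+        camstat_hashes = Table(
+            'camstat_hashes', metadata,
+            Column('dataflow_id', UnicodeText),
+            Column('resource_id', UnicodeText),
+            Column('hash', UnicodeText),
+            Column('last_updated', UnicodeText),
+        )
+        metadata.create_all(model.meta.engine)
+        ```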
+        '''
         model.Session.remove()
         model.Session.configure(bind=model.meta.engine)

@@ -104,6 +147,27 @@ def setup_tables(self):
         metadata.create_all(model.meta.engine)


+'''
+The following functions are used only for testing and debugging. They run when
+the command is invoked _only if they are uncommented_ in the `command` function:
+
+```
+# The following functions can be
+# used for testing and debugging
+#
+# purge_datasets(self.owner_org)
+# purge_organization(self.owner_org)
+# drop_table()
+```
+
+- `purge_datasets` - Deletes all current Camstat datasets
+- `purge_organization` - Deletes the Camstat organization from CKAN
+- `drop_table` - Removes the hash table from the DB
+
+Start of testing/debugging functions.
+'''
+
+
 def purge_datasets(owner_org):
     print(
         '> PURGING ALL DATASETS FOR ORGANIZATION: {}\n'
@@ -153,13 +217,38 @@ def purge_organization(owner_org):
     print('======================================\n')


+'''
+End of testing/debugging functions.
+'''
+
+
 def utf_8_encoder(unicode_csv_data):
+    '''
+    A small helper function to fix any improperly encoded data.
+    Currently **deprecated**.
+    '''
     for line in unicode_csv_data:
         yield line.encode('utf-8')


 def clean_csv(data, id_removal, dataflow_agency,
               dataflow_id, dataflow_version):
+    '''
+    Cleans the data.
+
+    It handles the following cases (two of them are sketched below):
+    - Reformats inconsistent values
+    - Removes special keyword prefixes from values (e.g. removes
+      `SOME_VALUE: ` from `SOME_VALUE: Some Value`)
+    - Removes `NA` values
+    - Wraps strings containing `,` in quotes so they aren't treated as new
+      columns (e.g. quotes values like "One, two, and three"; otherwise, each
+      item would be read as a separate column)
+    - Removes unused IDs
+    - Converts all headers from fully uppercase to title case
+      (e.g. `HEADER 1` -> `Header 1`)
+    - Removes empty columns
+
+    Once cleaning is done, the function calls `pivot_data` before returning
+    the final data (see `pivot_data` below for more information).
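+
+    A tiny sketch of two of these rules with hypothetical values (the real
+    implementation works on whole CSV rows):
+
+    ```
+    value = 'SOME_VALUE: Some Value'
+    value = value.split(': ', 1)[-1]  # -> 'Some Value'
+
+    header = 'HEADER 1'
+    header = header.title()  # -> 'Header 1'
+    ```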
+    '''
     print(' + Cleaning CSV data for: {}'.format(dataflow_id))

     cleaned = 0
@@ -242,6 +331,11 @@ def clean_csv(data, id_removal, dataflow_agency,
     return data

 def pivot_data(data):
+    '''
+    Pivots the data from a wide, less usable format to a cleaner CSV with one
+    observation per row. The main issue with the original format is that the
+    column headers we need for visualizations sit in a single column
+    themselves, instead of at the top as headers.
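+
+    A minimal, hypothetical illustration of the idea (the real data uses
+    different column names):
+
+    ```
+    import pandas as pd
+
+    # The values we want as headers live in one column:
+    df = pd.DataFrame({
+        'Indicator': ['Stunting', 'Wasting', 'Stunting', 'Wasting'],
+        'Year': [2014, 2014, 2021, 2021],
+        'Value': [32.4, 9.6, 22.3, 9.5],
+    })
+
+    # Lift the 'Indicator' values up into column headers:
+    pivoted = df.pivot_table(index='Year', columns='Indicator',
+                             values='Value').reset_index()
+    ```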
+    '''
     print(' + Pivoting data...')

     df = pd.DataFrame(data)
@@ -263,6 +357,9 @@ def pivot_data(data):


 def compare_hashes(existing_hash, new_hash):
+    '''
+    Compares a new dataset/resource hash with an existing hash (if one
+    exists) and returns `True` or `False`.
+    '''
     print(' + Comparing hashes...')
     print(' + Existing hash: {}'.format(existing_hash))
     print(' + New hash: {}'.format(new_hash))
@@ -279,6 +376,9 @@ def compare_hashes(existing_hash, new_hash):


 def upload_resource(dataflow_name_munged, dataflow_title, resource):
+    '''
+    Uploads a new resource to a given dataset.
+    '''
     print(' + Uploading resource to dataset: {}'.format(dataflow_title))

     try:
@@ -300,6 +400,9 @@ def upload_resource(dataflow_name_munged, dataflow_title, resource):


 def patch_resource(dataflow_name_munged, dataflow_title, resource, resource_id):
+    '''
+    Updates the data in a resource for a given dataset.
+    '''
     print(' + Updating resource: {}'.format(dataflow_title))

     try:
@@ -323,6 +426,9 @@ def patch_resource(dataflow_name_munged, dataflow_title, resource, resource_id):

 def create_dataset(dataflow_name_munged, owner_org,
                    dataflow_title, dataflow_description):
+    '''
+    Creates a new dataset for a given dataflow.
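+
+    A hedged sketch of the kind of call involved, using CKAN's action API
+    (the real function may build its data dict differently):
+
+    ```
+    import ckan.plugins.toolkit as toolkit
+
+    toolkit.get_action('package_create')(
+        {'ignore_auth': True},
+        {
+            'name': dataflow_name_munged,
+            'title': dataflow_title,
+            'notes': dataflow_description,
+            'owner_org': owner_org,
+        }
+    )
+    ```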
+    '''
     print(' + Creating dataset: {}'.format(dataflow_name_munged))

     try:
@@ -354,6 +460,9 @@ def create_dataset(dataflow_name_munged, owner_org,

 def patch_dataset(dataflow_name_munged, owner_org,
                   dataflow_title, dataflow_description):
+    '''
+    Updates a dataset for a given dataflow.
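+
+    Conceptually similar to `create_dataset`, but via `package_patch`, which
+    only changes the fields provided (again a sketch, not necessarily the
+    exact call made here):
+
+    ```
+    toolkit.get_action('package_patch')(
+        {'ignore_auth': True},
+        {
+            'id': dataflow_name_munged,
+            'title': dataflow_title,
+            'notes': dataflow_description,
+        }
+    )
+    ```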
+    '''
     print(' + Updating dataset: {}'.format(dataflow_name_munged))

     try:
@@ -376,6 +485,9 @@ def patch_dataset(dataflow_name_munged, owner_org,


 def verify_organization_exists(owner_org):
+    '''
+    Checks whether the Camstat organization exists. If it doesn't, it creates
+    the organization; otherwise, this step is skipped.
+    '''
     print('> VERIFYING ORGANIZATION {} EXISTS...'.format(owner_org))

     try:
@@ -400,6 +512,12 @@ def verify_organization_exists(owner_org):

 def prepare_dataflow_description(dataflow_description, dataflow_id,
                                  dataflow_last_extracted):
+    '''
+    Builds the dataset description with the dataflow ID and the current time.
+    For example:
+
+    >**Extracted from**: _DF_NUTRITION_
+    >
+    >**Last extracted**: _2022-09-28 07:56 PM (UTC)_
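+
+    Roughly how such a description string can be assembled (a sketch; the
+    exact formatting may differ):
+
+    ```
+    description = (
+        '{}\n\n'
+        '>**Extracted from**: _{}_\n\n'
+        '>**Last extracted**: _{}_'
+    ).format(dataflow_description, dataflow_id, dataflow_last_extracted)
+    ```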
+    '''
     print('\n + Preparing description...')

     if dataflow_description:
@@ -424,6 +542,9 @@ def prepare_dataflow_description(dataflow_description, dataflow_id,

 def get_data(dataflow_agency, dataflow_id,
              dataflow_version, data_type):
+    '''
+    Retrieves the raw data from the
+    [Camstat data API](https://nsiws-stable-camstat-live.officialstatistics.org).
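+
+    The host appears to be an SDMX-style web service, so the request is
+    sketched below (the exact path and headers are assumptions, built from
+    `dataflow_agency`, `dataflow_id`, and `dataflow_version`):
+
+    ```
+    import requests
+
+    url = ('https://nsiws-stable-camstat-live.officialstatistics.org'
+           '/rest/data/{},{},{}/all'.format(dataflow_agency, dataflow_id,
+                                            dataflow_version))
+    response = requests.get(url, headers={'Accept': 'text/csv'})
+    data = response.text
+    ```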
+    '''
     if data_type == 'both':
         print(' + Retrieving raw data...')

@@ -468,6 +589,10 @@ def get_data(dataflow_agency, dataflow_id,


 def write_csv(data, csv_filename):
+    '''
+    Creates a temporary CSV file from the raw data. This file is then cleaned,
+    transformed, and uploaded as a resource, as described in the other steps.
+    '''
     print(' + Writing CSV data to temporary file...')

     tmp_file = tempfile.NamedTemporaryFile('w+b')
@@ -495,6 +620,10 @@ def write_csv(data, csv_filename):


 def get_dataflows():
+    '''
+    Retrieves the list of dataflow IDs/names from the Health and Nutrition
+    topic of the Camstat API. This is used to retrieve the raw data for each
+    dataflow.
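+
+    A hedged sketch of the kind of request involved (the exact endpoint and
+    any topic filtering are assumptions):
+
+    ```
+    response = requests.get(
+        'https://nsiws-stable-camstat-live.officialstatistics.org'
+        '/rest/dataflow/all/all/latest',
+        headers={'Accept': 'application/json'}
+    )
+    dataflows = response.json()
+    ```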
+    '''
     print(
         '\n======================================\n\n'
         '> RETRIEVING DATAFLOWS...'
@@ -567,6 +696,9 @@ def get_dataflows():


 def get_new_hash(data):
+    '''
+    Generates a new data hash. This is compared to the existing data hash
+    later.
+    '''
     print(' + Generating new hash...')

     new_hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
@@ -577,6 +709,9 @@ def get_new_hash(data):


 def get_existing_hash(dataflow_id):
+    '''
+    Retrieves the existing data hash for a given resource (if one already
+    exists).
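+
+    Conceptually (table and column names follow the illustrative sketch in
+    `setup_tables`, not necessarily the real schema):
+
+    ```
+    result = connection.execute(
+        'SELECT hash FROM camstat_hashes WHERE dataflow_id = %s',
+        (dataflow_id,)
+    ).fetchone()
+    existing_hash = result[0] if result else None
+    ```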
+    '''
     print(' + Retrieving existing hash for: {}'.format(dataflow_id))

     connection = model.Session.connection()
@@ -606,6 +741,10 @@ def get_existing_hash(dataflow_id):

 def update_hash(dataflow_id, dataflow_name_munged, resource_id,
                 new_hash, dataflow_last_updated, existing_hash, was_deleted):
+    '''
+    Updates the data hash in the DB if there was a change. This function also
+    removes the hash if the resource/dataset is deleted.
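+
+    Conceptually, for the simple update case (same illustrative schema as in
+    `setup_tables`):
+
+    ```
+    connection.execute(
+        'UPDATE camstat_hashes SET hash = %s, last_updated = %s '
+        'WHERE dataflow_id = %s',
+        (new_hash, dataflow_last_updated, dataflow_id)
+    )
+    ```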
+    '''
     print(' + Updating hashes...')

     connection = model.Session.connection()
@@ -648,6 +787,32 @@ def update_hash(dataflow_id, dataflow_name_munged, resource_id,


 def update_camstat(owner_org, languages):
+    '''
+    This function contains the overall process. It's where most of the
+    previous functions get called as needed.
+
+    Here's the flow of the process (a condensed sketch follows the list):
+    - Retrieves all dataflows using `get_dataflows`
+    - Iterates over the dataflows:
+        - Extracts the metadata from the dataflow
+          (title, ID, name, agency, description)
+        - Retrieves the current time
+          (used for "Last extracted")
+        - Calls `prepare_dataflow_description`
+          (this is added to the dataset object later in the process)
+        - Retrieves the raw data using `get_data`
+        - Cleans the retrieved raw data using `clean_csv`
+        - Writes the temporary CSV using `write_csv`
+        - Generates a new data hash using `get_new_hash`
+        - Retrieves the existing data hash (if one exists)
+        - Checks whether the new and existing data hashes differ:
+            - If no existing data hash exists:
+                - Creates a new dataset and resource
+            - If the data hashes match:
+                - The data is up to date; moves on to the next dataflow
+            - If they differ:
+                - Updates the resource in the dataset with the latest data
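+
+    In code, the loop looks roughly like this (heavily condensed, with
+    simplified signatures):
+
+    ```
+    for dataflow in get_dataflows():
+        data = clean_csv(get_data(...), ...)
+        new_hash = get_new_hash(data)
+        existing_hash = get_existing_hash(dataflow_id)
+        if existing_hash is None:
+            create_dataset(...)              # first run for this dataflow
+            upload_resource(...)
+            update_hash(...)
+        elif compare_hashes(existing_hash, new_hash):
+            patch_resource(...)              # data changed: update in place
+            update_hash(...)
+        # otherwise the data is unchanged and the dataflow is skipped
+    ```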
+    '''
     verify_organization_exists(owner_org)

     # languages = ['km', 'en']