Commit 30bc789

Add camstat docs (#606)
1 parent ea81d04 commit 30bc789

File tree

1 file changed: +165 −0 lines changed


ckanext/querytool/commands/camstat.py

Lines changed: 165 additions & 0 deletions
@@ -1,5 +1,23 @@
# -*- coding: utf-8 -*-

'''
USAGE: paster --plugin=ckanext-querytool update_camstat --config=/path/to/production.ini

`paster` is a command provided by [The Pylons Project](https://docs.pylonsproject.org/en/latest/),
a collection of tools and utilities for Python that is used extensively in CKAN. It allows you to
run scripts in various locations, all through a unified utility.

For example, CKAN and any extension installed along with it can ship scripts for miscellaneous
purposes. `paster` makes it easy to run any of these scripts. You only need to provide the plugin
name (`ckan` for CKAN core scripts or `ckanext-EXTENSION_NAME` for extensions), the command, and
the path to your `.ini` CKAN config file.

**Note**: On first run, the script will create the organization (if it doesn't exist) and datasets,
as well as upload all data into resources. Every time you run the script after that, it will first
check whether the data has changed. If nothing has changed, the script won't do anything. If any of
the source data has changed, only the affected datasets will be updated.
'''
from __future__ import print_function
import requests
import csv
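
For context, a `paster` command like `update_camstat` is typically registered through the extension's `setup.py` entry points. A minimal sketch, assuming the standard `[paste.paster_command]` group (the actual entry-point block for ckanext-querytool is not part of this commit):

```
# setup.py (excerpt) -- illustrative sketch only; the real entry points
# for ckanext-querytool are not shown in this commit.
from setuptools import setup

setup(
    name='ckanext-querytool',
    entry_points='''
        [paste.paster_command]
        update_camstat = ckanext.querytool.commands.camstat:UpdateCamstat
    ''',
)
```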
@@ -32,6 +50,8 @@ class UpdateCamstat(CkanCommand):
    cleans it, and creates datasets and resources in CKAN if they don't exist.
    If they do exist, it compares the new checksum with a stored checksum and
    updates them if there are changes.

    This class contains the main command (`command`) and table setup (`setup_tables`) functions.
    '''

    summary = __doc__.split('\n')[0]
@@ -40,6 +60,19 @@ class UpdateCamstat(CkanCommand):
    min_args = 0

    def command(self):
        '''
        Adds the command to the CKAN `paster` ecosystem.

        This is how CKAN knows what to do when you use:

            paster --plugin=ckanext-querytool update_camstat --config=/path/to/production.ini

        `command` does two things:
        - If the hash table doesn't exist, it creates it. Hashes are generated and
          stored to know if the data needs to be updated due to changes.
        - It calls `update_camstat` (which calls all other functions as needed).
        '''
        self._load_config()
        self.owner_org = 'camstat'
        self.languages = ['en', 'km']
@@ -66,6 +99,16 @@ def command(self):
        update_camstat(self.owner_org, self.languages)

    def setup_tables(self):
        '''
        When called, this function creates a table in the DB, `camstat_hashes`,
        to store distinct hashes of the data.

        When the script is run again later:
        - New hashes are generated and compared to the old hashes before updating
          each dataset.
        - If the hash has changed for a given dataset, that dataset is updated with
          the latest data and the new hash is saved to the DB.
        '''
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
@@ -104,6 +147,27 @@ def setup_tables(self):
        metadata.create_all(model.meta.engine)


'''
The following functions are only used for testing/debugging. They will run with the command
_only when uncommented_ in the `command` function:

```
# The following functions can be
# used for testing and debugging
#
# purge_datasets(self.owner_org)
# purge_organization(self.owner_org)
# drop_table()
```

- `purge_datasets` - Deletes all current Camstat datasets
- `drop_table` - Removes the hash table from the DB
- `purge_organization` - Deletes the Camstat organization from CKAN

Start of testing/debugging functions.
'''


def purge_datasets(owner_org):
    print(
        '> PURGING ALL DATASETS FOR ORGANIZATION: {}\n'
@@ -153,13 +217,38 @@ def purge_organization(owner_org):
    print(' ======================================\n')


'''
End of testing/debugging functions.
'''


def utf_8_encoder(unicode_csv_data):
    '''
    A small helper function to fix any improperly encoded data.
    Currently **deprecated**.
    '''
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def clean_csv(data, id_removal, dataflow_agency,
              dataflow_id, dataflow_version):
    '''
    Cleans the data.

    It handles the following cases:
    - Reformats inconsistent values
    - Removes special keywords before the values (e.g. removes `SOME_VALUE: ` in `SOME_VALUE: Some Value`)
    - Removes `NA` values
    - Wraps strings containing `,` in quotes so they aren't treated as new columns
      (e.g. adds quotes to values like "One, two, and three"; otherwise, each item would be considered a new column)
    - Removes unused IDs
    - Converts all headers from fully uppercase to titles (e.g. `HEADER 1` -> `Header 1`)
    - Removes empty columns

    Once cleaning is done, the function calls `pivot_data` before returning the final data
    (see the next section for more information regarding `pivot_data`).
    '''
    print(' + Cleaning CSV data for: {}'.format(dataflow_id))

    cleaned = 0
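
To make a few of the cases above concrete, here is a small illustrative sketch (not the commit's code) of keyword stripping, `NA` removal, and header title-casing:

```
import re

def demo_clean(rows):
    # Illustrative only: shows three of the cleaning cases above.
    # Convert fully uppercase headers to titles: 'HEADER 1' -> 'Header 1'.
    cleaned = [[header.title() for header in rows[0]]]
    for row in rows[1:]:
        new_row = []
        for value in row:
            # Strip 'SOME_VALUE: ' style keyword prefixes.
            value = re.sub(r'^[A-Z_]+:\s*', '', value)
            # Drop 'NA' values.
            new_row.append('' if value == 'NA' else value)
        cleaned.append(new_row)
    return cleaned

print(demo_clean([['HEADER 1'], ['SOME_VALUE: Some Value'], ['NA']]))
# [['Header 1'], ['Some Value'], ['']]
```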
@@ -242,6 +331,11 @@ def clean_csv(data, id_removal, dataflow_agency,
    return data

def pivot_data(data):
    '''
    Pivots the data from a wider, less usable format to a cleaner, vertical CSV with one
    observation per row. The main issue with the original format is that the column headers
    we need for visualizations are themselves values in a single column, instead of sitting
    at the top as headers.
    '''
    print(' + Pivoting data...')

    df = pd.DataFrame(data)
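
A hedged pandas sketch of the kind of reshape described above; the column names (`Area`, `Indicator`, `Value`) are assumptions about the layout, not taken from the commit:

```
import pandas as pd

# Illustrative input: the headers we want (the 'Indicator' values)
# are stuck in a single column.
df = pd.DataFrame({
    'Area':      ['Phnom Penh', 'Phnom Penh', 'Siem Reap'],
    'Indicator': ['Stunting', 'Wasting', 'Stunting'],
    'Value':     [32.4, 9.6, 36.1],
})

# Lift each indicator up into its own column header.
pivoted = df.pivot_table(index='Area', columns='Indicator',
                         values='Value').reset_index()
print(pivoted)
```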
@@ -263,6 +357,9 @@ def pivot_data(data):


def compare_hashes(existing_hash, new_hash):
    '''
    Compares a new dataset/resource hash with an existing hash (if one exists) and returns `True` or `False`.
    '''
    print(' + Comparing hashes...')
    print(' + Existing hash: {}'.format(existing_hash))
    print(' + New hash: {}'.format(new_hash))
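
The comparison itself amounts to string equality on the two digests; a minimal sketch of the logic (the handling of a missing hash is an assumption based on the docstrings):

```
def hashes_match(existing_hash, new_hash):
    # No existing hash means this is the first run for the dataflow.
    if existing_hash is None:
        return False
    return existing_hash == new_hash
```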
@@ -279,6 +376,9 @@ def compare_hashes(existing_hash, new_hash):


def upload_resource(dataflow_name_munged, dataflow_title, resource):
    '''
    Uploads a new resource to a given dataset.
    '''
    print(' + Uploading resource to dataset: {}'.format(dataflow_title))

    try:
@@ -300,6 +400,9 @@ def upload_resource(dataflow_name_munged, dataflow_title, resource):


def patch_resource(dataflow_name_munged, dataflow_title, resource, resource_id):
    '''
    Updates the data in a resource for a given dataset.
    '''
    print(' + Updating resource: {}'.format(dataflow_title))

    try:
@@ -323,6 +426,9 @@ def patch_resource(dataflow_name_munged, dataflow_title, resource, resource_id):

def create_dataset(dataflow_name_munged, owner_org,
                   dataflow_title, dataflow_description):
    '''
    Creates a new dataset for a given dataflow.
    '''
    print(' + Creating dataset: {}'.format(dataflow_name_munged))

    try:
@@ -354,6 +460,9 @@ def create_dataset(dataflow_name_munged, owner_org,

def patch_dataset(dataflow_name_munged, owner_org,
                  dataflow_title, dataflow_description):
    '''
    Updates a dataset for a given dataflow.
    '''
    print(' + Updating dataset: {}'.format(dataflow_name_munged))

    try:
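
The dataset create/patch functions above wrap CKAN's action API. A hedged sketch of the create side using the standard `package_create` action (the context and extra fields used by the real code are assumptions):

```
import ckan.plugins.toolkit as toolkit

def create_dataset_sketch(name, owner_org, title, description):
    # Illustrative only -- the real function likely sets more fields.
    return toolkit.get_action('package_create')(
        {'ignore_auth': True},
        {'name': name, 'owner_org': owner_org,
         'title': title, 'notes': description})
```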
@@ -376,6 +485,9 @@ def patch_dataset(dataflow_name_munged, owner_org,


def verify_organization_exists(owner_org):
    '''
    Checks if the Camstat organization exists. If it doesn't, it creates the
    organization; otherwise, this step is skipped.
    '''
    print('> VERIFYING ORGANIZATION {} EXISTS...'.format(owner_org))

    try:
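
In CKAN terms this check is typically an `organization_show` that falls back to `organization_create`; a minimal sketch, assuming the action API is used (the commit's exact calls aren't shown in this hunk):

```
import ckan.plugins.toolkit as toolkit

def ensure_organization(owner_org):
    try:
        # Raises ObjectNotFound if the organization doesn't exist yet.
        toolkit.get_action('organization_show')({}, {'id': owner_org})
    except toolkit.ObjectNotFound:
        toolkit.get_action('organization_create')(
            {'ignore_auth': True},
            {'name': owner_org, 'title': 'Camstat'})
```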
@@ -400,6 +512,12 @@ def verify_organization_exists(owner_org):

def prepare_dataflow_description(dataflow_description, dataflow_id,
                                 dataflow_last_extracted):
    '''
    Builds the dataset description with the dataflow ID and current time. For example:

    >**Extracted from**: _DF_NUTRITION_
    >
    >**Last extracted**: _2022-09-28 07:56 PM (UTC)_
    '''
    print('\n + Preparing description...')

    if dataflow_description:
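
A small sketch of how such a description block might be assembled; the exact template string is an assumption mirroring the rendered example above:

```
def build_description(existing, dataflow_id, last_extracted):
    # Assumed template -- mirrors the example in the docstring.
    note = (
        '>**Extracted from**: _{}_\n'
        '>\n'
        '>**Last extracted**: _{}_'.format(dataflow_id, last_extracted)
    )
    if existing:
        return existing + '\n\n' + note
    return note

print(build_description('', 'DF_NUTRITION', '2022-09-28 07:56 PM (UTC)'))
```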
@@ -424,6 +542,9 @@ def prepare_dataflow_description(dataflow_description, dataflow_id,

def get_data(dataflow_agency, dataflow_id,
             dataflow_version, data_type):
    '''
    Retrieves the raw data from the [Camstat data API](https://nsiws-stable-camstat-live.officialstatistics.org).
    '''
    if data_type == 'both':
        print(' + Retrieving raw data...')

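Retrieval is a plain HTTP GET against that endpoint. A sketch following generic SDMX REST conventions; the URL path and `Accept` header are assumptions, not copied from the commit:

```
import requests

BASE = 'https://nsiws-stable-camstat-live.officialstatistics.org'

def fetch_dataflow_csv(agency, dataflow_id, version):
    # Generic SDMX REST data query -- the path layout is an assumption.
    url = '{}/rest/data/{},{},{}/all'.format(
        BASE, agency, dataflow_id, version)
    response = requests.get(
        url, headers={'Accept': 'application/vnd.sdmx.data+csv'})
    response.raise_for_status()
    return response.text
```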
@@ -468,6 +589,10 @@ def get_data(dataflow_agency, dataflow_id,


def write_csv(data, csv_filename):
    '''
    Creates a temporary CSV file from the raw data. This is then cleaned, transformed,
    and uploaded as a resource, as mentioned in the other steps.
    '''
    print(' + Writing CSV data to temporary file...')

    tmp_file = tempfile.NamedTemporaryFile('w+b')
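
A minimal sketch of the temp-file pattern, under the assumption that the handle is kept open so the file survives until the upload step reads it back by name:

```
import csv
import tempfile

def write_temp_csv(rows):
    # Keep the handle: a NamedTemporaryFile is removed when it closes.
    tmp_file = tempfile.NamedTemporaryFile('w+', suffix='.csv')
    csv.writer(tmp_file).writerows(rows)
    tmp_file.flush()
    return tmp_file

tmp = write_temp_csv([['Header 1'], ['one, two, and three']])
print(open(tmp.name).read())  # note the quoted comma value
```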
@@ -495,6 +620,10 @@ def write_csv(data, csv_filename):


def get_dataflows():
    '''
    Retrieves the list of dataflow IDs/names from the Health and Nutrition topic of the Camstat API.
    This is used to retrieve the raw data for each dataflow.
    '''
    print(
        '\n ======================================\n\n'
        '> RETRIEVING DATAFLOWS...'
@@ -567,6 +696,9 @@ def get_dataflows():


def get_new_hash(data):
    '''
    Generates a new data hash. This is compared to the existing data hash later.
    '''
    print(' + Generating new hash...')

    new_hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
@@ -577,6 +709,9 @@ def get_new_hash(data):


def get_existing_hash(dataflow_id):
    '''
    Retrieves the existing data hash for a given resource (if one already exists).
    '''
    print(' + Retrieving existing hash for: {}'.format(dataflow_id))

    connection = model.Session.connection()
@@ -606,6 +741,10 @@ def get_existing_hash(dataflow_id):

def update_hash(dataflow_id, dataflow_name_munged, resource_id,
                new_hash, dataflow_last_updated, existing_hash, was_deleted):
    '''
    Updates the data hash in the DB if there was a change. This function will also
    remove a hash if the resource/dataset is deleted.
    '''
    print(' + Updating hashes...')

    connection = model.Session.connection()
@@ -648,6 +787,32 @@ def update_hash(dataflow_id, dataflow_name_munged, resource_id,


def update_camstat(owner_org, languages):
    '''
    This function contains the overall process code. It's where most of the previous
    functions get called as needed.

    Here's the flow of the process:
    - Retrieves all dataflows using `get_dataflows`
    - Iterates over the dataflows:
        - Extracts the metadata from the dataflow
          (title, ID, name, agency, description)
        - Retrieves the current time
          (used for "Last extracted")
        - Calls `prepare_dataflow_description`
          (this will be added to the dataset object later in the process)
        - Retrieves the raw data using `get_data`
        - Cleans the retrieved raw data using `clean_csv`
        - Writes the temporary CSV using `write_csv`
        - Generates a new data hash using `get_new_hash`
        - Retrieves the existing data hash (if one exists)
        - Checks if the new data hash and existing data hash differ:
            - If no existing data hash exists:
                - Creates a new dataset and resource
            - If the data hashes match:
                - The data is up to date; moves on to the next dataflow
            - If they differ:
                - Updates the resource in the dataset with the latest data
    '''
    verify_organization_exists(owner_org)

    # languages = ['km', 'en']
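
Condensed into runnable pseudo-Python, the decision logic described above looks roughly like this (stubbed and simplified; the names and signatures are illustrative, not the commit's):

```
import hashlib

def digest(data):
    # Same scheme as `get_new_hash`: hash the str() of the data.
    return hashlib.sha256(str(data).encode('utf-8')).hexdigest()

def update_all(dataflows, stored_hashes):
    # `dataflows` maps dataflow IDs to cleaned data;
    # `stored_hashes` stands in for the `camstat_hashes` table.
    for flow_id, data in dataflows.items():
        new_hash = digest(data)
        existing = stored_hashes.get(flow_id)
        if existing is None:
            print('create dataset + resource for', flow_id)
        elif existing == new_hash:
            print(flow_id, 'is up to date')
            continue
        else:
            print('patch resource for', flow_id)
        stored_hashes[flow_id] = new_hash

update_all({'DF_NUTRITION': [['Header 1'], ['value']]}, {})
```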
