From c32eff38f0c1016c19b5be175c4ece8f96a28a78 Mon Sep 17 00:00:00 2001 From: mcarans Date: Thu, 9 Oct 2025 16:13:03 +1300 Subject: [PATCH 1/6] Remove unused generate_qc_resource_from_rows method, test and documentation Use save_iterable in generate_resource_from_rows --- documentation/index.md | 12 +--- pyproject.toml | 4 +- requirements.txt | 57 ++++++++------- src/hdx/data/dataset.py | 72 ++++--------------- .../data/test_dataset_resource_generation.py | 62 ---------------- 5 files changed, 46 insertions(+), 161 deletions(-) diff --git a/documentation/index.md b/documentation/index.md index 771455a..eeea5e8 100755 --- a/documentation/index.md +++ b/documentation/index.md @@ -54,6 +54,8 @@ The library has detailed API documentation which can be found in the menu at the ## Breaking Changes +From 6., remove unused `generate_qc_resource_from_rows` method. + From 6.5.0, files will not be uploaded to the HDX filestore if the hash and size have not changed, but if there are any resource metadata changes, except for last_modified, they will still take place. @@ -824,16 +826,6 @@ treated as containing values: dataset.generate_resource_from_rows("FOLDER", "FILENAME", ROWS, RESOURCE DATA, HEADERS, "ENCODING") -A resource for the purpose of driving QuickCharts can be generated by taking -ROWS, a list of dictionaries, and producing a cut down subset from it. HXLTAGS -are added as the second row after the header. The reduction in rows is -performed by only outputting the rows where COLUMN_NAME has a value in -QC_IDENTIFIERS. Optionally the columns that are output can be limited by -specifying them in HEADERS. - - dataset.generate_qc_resource_from_rows("FOLDER", "FILENAME", ROWS, - RESOURCE DATA, HXLTAGS, "COLUMN_NAME", QC_IDENTIFIERS, HEADERS, "ENCODING") - Building on these basic resource generation methods, there are more powerful ones `generate_resource_from_iterator` and `download_and_generate_resource`. 
diff --git a/pyproject.toml b/pyproject.toml index 7e5864f..fbba8e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,8 @@ dependencies = [ "ckanapi>=4.8", "defopt>=7.0.0", "email_validator", - "hdx-python-country>=3.9.6", - "hdx-python-utilities>=3.9.1", + "hdx-python-country>=3.9.8", + "hdx-python-utilities>=3.9.4", "libhxl>=5.2.2", "makefun", "quantulum3", diff --git a/requirements.txt b/requirements.txt index b5b5547..a4ef5f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ annotated-types==0.7.0 # via pydantic astdoc==1.3.2 # via mkapi -attrs==25.3.0 +attrs==25.4.0 # via # frictionless # jsonlines @@ -14,9 +14,9 @@ babel==2.17.0 # via mkdocs-material backrefs==5.9 # via mkdocs-material -cachetools==5.5.2 +cachetools==6.2.0 # via google-auth -certifi==2025.8.3 +certifi==2025.10.5 # via requests cfgv==3.4.0 # via pre-commit @@ -26,38 +26,37 @@ charset-normalizer==3.4.3 # via requests ckanapi==4.8 # via hdx-python-api (pyproject.toml) -click==8.2.1 +click==8.3.0 # via # mkdocs - # mkdocs-material # typer colorama==0.4.6 # via mkdocs-material -coverage==7.10.6 +coverage==7.10.7 # via pytest-cov defopt==7.0.0 # via hdx-python-api (pyproject.toml) distlib==0.4.0 # via virtualenv -dnspython==2.7.0 +dnspython==2.8.0 # via email-validator docopt==0.6.2 # via # ckanapi # num2words -docutils==0.22 +docutils==0.22.2 # via defopt email-validator==2.3.0 # via hdx-python-api (pyproject.toml) et-xmlfile==2.0.0 # via openpyxl -filelock==3.19.1 +filelock==3.20.0 # via virtualenv frictionless==5.18.1 # via hdx-python-utilities ghp-import==2.1.0 # via mkdocs -google-auth==2.40.3 +google-auth==2.41.1 # via # google-auth-oauthlib # gspread @@ -65,15 +64,15 @@ google-auth-oauthlib==1.2.2 # via gspread gspread==6.2.1 # via hdx-python-api (pyproject.toml) -hdx-python-country==3.9.6 +hdx-python-country==3.9.8 # via hdx-python-api (pyproject.toml) -hdx-python-utilities==3.9.1 +hdx-python-utilities==3.9.4 # via # hdx-python-api (pyproject.toml) # 
hdx-python-country humanize==4.13.0 # via frictionless -identify==2.6.13 +identify==2.6.15 # via pre-commit idna==3.10 # via @@ -101,7 +100,7 @@ jsonschema==4.25.1 # via # frictionless # tableschema-to-template -jsonschema-specifications==2025.4.1 +jsonschema-specifications==2025.9.1 # via jsonschema libhxl==5.2.2 # via @@ -111,7 +110,7 @@ loguru==0.7.3 # via hdx-python-utilities makefun==1.16.0 # via hdx-python-api (pyproject.toml) -markdown==3.8.2 +markdown==3.9 # via # mkdocs # mkdocs-material @@ -120,7 +119,7 @@ markdown-it-py==4.0.0 # via rich marko==2.2.0 # via frictionless -markupsafe==3.0.2 +markupsafe==3.0.3 # via # jinja2 # mkdocs @@ -138,7 +137,7 @@ mkdocs==1.6.1 # mkdocs-material mkdocs-get-deps==0.2.0 # via mkdocs -mkdocs-material==9.6.18 +mkdocs-material==9.6.21 # via mkapi mkdocs-material-extensions==1.3.1 # via mkdocs-material @@ -162,7 +161,7 @@ pathspec==0.12.1 # via mkdocs petl==1.7.17 # via frictionless -platformdirs==4.4.0 +platformdirs==4.5.0 # via # mkdocs-get-deps # virtualenv @@ -184,9 +183,9 @@ pyasn1==0.6.1 # rsa pyasn1-modules==0.4.2 # via google-auth -pydantic==2.11.7 +pydantic==2.12.0 # via frictionless -pydantic-core==2.33.2 +pydantic-core==2.41.1 # via pydantic pygments==2.19.2 # via @@ -197,14 +196,14 @@ pymdown-extensions==10.16.1 # via mkdocs-material pyphonetics==0.5.3 # via hdx-python-utilities -pytest==8.4.1 +pytest==8.4.2 # via # hdx-python-api (pyproject.toml) # pytest-check # pytest-cov -pytest-check==2.5.3 +pytest-check==2.6.0 # via hdx-python-api (pyproject.toml) -pytest-cov==6.2.1 +pytest-cov==7.0.0 # via hdx-python-api (pyproject.toml) python-dateutil==2.9.0.post0 # via @@ -218,7 +217,7 @@ python-slugify==8.0.4 # via # ckanapi # frictionless -pyyaml==6.0.2 +pyyaml==6.0.3 # via # frictionless # mkdocs @@ -262,7 +261,7 @@ rsa==4.9.1 # via google-auth ruamel-yaml==0.18.15 # via hdx-python-utilities -ruamel-yaml-clib==0.2.12 +ruamel-yaml-clib==0.2.14 # via ruamel-yaml setuptools==80.9.0 # via ckanapi @@ -270,7 +269,7 @@ 
shellingham==1.5.4 # via typer simpleeval==1.0.3 # via frictionless -simplejson==3.20.1 +simplejson==3.20.2 # via ckanapi six==1.17.0 # via @@ -292,7 +291,7 @@ text-unidecode==1.3 # via python-slugify typeguard==4.4.4 # via inflect -typer==0.17.3 +typer==0.19.2 # via frictionless typing-extensions==4.15.0 # via @@ -302,7 +301,7 @@ typing-extensions==4.15.0 # typeguard # typer # typing-inspection -typing-inspection==0.4.1 +typing-inspection==0.4.2 # via pydantic unidecode==1.4.0 # via @@ -326,7 +325,7 @@ xlrd3==1.1.0 # via libhxl xlsx2csv==0.8.4 # via hdx-python-utilities -xlsxwriter==3.2.5 +xlsxwriter==3.2.9 # via tableschema-to-template xlwt==1.3.0 # via hdx-python-utilities diff --git a/src/hdx/data/dataset.py b/src/hdx/data/dataset.py index f3d06a8..b51586f 100755 --- a/src/hdx/data/dataset.py +++ b/src/hdx/data/dataset.py @@ -45,11 +45,11 @@ parse_date, parse_date_range, ) -from hdx.utilities.dictandlist import merge_two_dictionaries, write_list_to_csv +from hdx.utilities.dictandlist import merge_two_dictionaries from hdx.utilities.downloader import Download from hdx.utilities.loader import load_json from hdx.utilities.path import script_dir_plus_file -from hdx.utilities.saver import save_json +from hdx.utilities.saver import save_iterable, save_json from hdx.utilities.typehint import ListTuple, ListTupleDict from hdx.utilities.uuid import is_valid_uuid @@ -2619,11 +2619,12 @@ def generate_resource_from_rows( self, folder: str, filename: str, - rows: List[ListTupleDict], + rows: Iterable[ListTupleDict], resourcedata: Dict, headers: Optional[ListTuple[str]] = None, + format: str = "csv", encoding: Optional[str] = None, - ) -> "Resource": + ) -> Optional["Resource"]: """Write rows to csv and create resource, adding it to the dataset. The headers argument is either a row number (rows start counting at 1), or the actual headers defined as a list of strings. 
If not set, all @@ -2632,72 +2633,27 @@ def generate_resource_from_rows( Args: folder (str): Folder to which to write file containing rows filename (str): Filename of file to write rows - rows (List[ListTupleDict]): List of rows in dict or list form + rows (Iterable[ListTupleDict]): List of rows in dict or list form resourcedata (Dict): Resource data headers (Optional[ListTuple[str]]): List of headers. Defaults to None. + format (str): Format to write. Defaults to csv. encoding (Optional[str]): Encoding to use. Defaults to None (infer encoding). Returns: - Resource: The created resource + Optional[Resource]: The created resource or None if not created """ filepath = join(folder, filename) - write_list_to_csv(filepath, rows, columns=headers, encoding=encoding) + res = save_iterable( + filepath, rows, columns=headers, format=format, encoding=encoding + ) + if not res: + return None resource = res_module.Resource(resourcedata) - resource.set_format("csv") + resource.set_format(format) resource.set_file_to_upload(filepath) self.add_update_resource(resource) return resource - def generate_qc_resource_from_rows( - self, - folder: str, - filename: str, - rows: List[Dict], - resourcedata: Dict, - hxltags: Dict[str, str], - columnname: str, - qc_identifiers: ListTuple[str], - headers: Optional[ListTuple[str]] = None, - encoding: Optional[str] = None, - ) -> Optional["Resource"]: - """Generate QuickCharts rows by cutting down input rows by relevant - identifiers and optionally restricting to certain columns. Output to - csv and create resource, adding it to the dataset. 
- - Args: - folder (str): Folder to which to write file containing rows - filename (str): Filename of file to write rows - rows (List[Dict]): List of rows in dict form - resourcedata (Dict): Resource data - hxltags (Dict[str,str]): Header to HXL hashtag mapping - columnname (str): Name of column containing identifier - qc_identifiers (ListTuple[str]): List of ids to match - headers (Optional[ListTuple[str]]): List of headers to output. Defaults to None (all headers). - encoding (Optional[str]): Encoding to use. Defaults to None (infer encoding). - - Returns: - Optional[Resource]: The created resource or None - """ - qc_rows = [] - for row in rows: - if row[columnname] in qc_identifiers: - if headers: - qcrow = {x: row[x] for x in headers} - else: - qcrow = row - qc_rows.append(qcrow) - if len(qc_rows) == 0: - return None - qc_rows.insert(0, hxltags) - return self.generate_resource_from_rows( - folder, - filename, - qc_rows, - resourcedata, - headers=headers, - encoding=encoding, - ) - def generate_resource_from_iterable( self, headers: ListTuple[str], diff --git a/tests/hdx/data/test_dataset_resource_generation.py b/tests/hdx/data/test_dataset_resource_generation.py index 87e126f..4329346 100644 --- a/tests/hdx/data/test_dataset_resource_generation.py +++ b/tests/hdx/data/test_dataset_resource_generation.py @@ -38,68 +38,6 @@ class TestDatasetResourceGeneration: "ISO3": "#country+code", } - def test_generate_qc_resource_from_rows(self, configuration): - with temp_dir("test") as folder: - with Download(user_agent="test") as downloader: - _, rows = downloader.get_tabular_rows( - TestDatasetResourceGeneration.url, - dict_form=True, - format="csv", - ) - rows = list(rows) - dataset = Dataset({"name": "test"}) - qc_filename = "qc_conflict_data_alg.csv" - resourcedata = { - "name": "Conflict Data for Algeria", - "description": "Conflict data with HXL tags", - } - columnname = "EVENT_ID_CNTY" - qc_indicator_codes = ["1416RTA", "XXXXRTA", "2231RTA"] - resource = 
dataset.generate_qc_resource_from_rows( - folder, - qc_filename, - rows, - resourcedata, - TestDatasetResourceGeneration.hxltags, - columnname, - qc_indicator_codes, - ) - assert resource == { - "name": "Conflict Data for Algeria", - "description": "Conflict data with HXL tags", - "format": "csv", - } - assert_files_same( - join("tests", "fixtures", "qc_from_rows", qc_filename), - join(folder, qc_filename), - ) - qc_filename = "qc_conflict_data_alg_one_col.csv" - dataset.generate_qc_resource_from_rows( - folder, - qc_filename, - rows, - resourcedata, - TestDatasetResourceGeneration.hxltags, - columnname, - qc_indicator_codes, - headers=[columnname], - ) - assert_files_same( - join("tests", "fixtures", "qc_from_rows", qc_filename), - join(folder, qc_filename), - ) - rows = [] - resource = dataset.generate_qc_resource_from_rows( - folder, - qc_filename, - rows, - resourcedata, - TestDatasetResourceGeneration.hxltags, - columnname, - qc_indicator_codes, - ) - assert resource is None - def test_download_and_generate_resource(self, configuration): with temp_dir("test") as folder: filename = "conflict_data_alg.csv" From 5e94e251602f45c7eae9d2427920b1dd2cb03593 Mon Sep 17 00:00:00 2001 From: mcarans Date: Thu, 9 Oct 2025 16:30:59 +1300 Subject: [PATCH 2/6] Fix test --- tests/hdx/data/test_dataset_noncore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/hdx/data/test_dataset_noncore.py b/tests/hdx/data/test_dataset_noncore.py index 6a5d5b9..38624a0 100755 --- a/tests/hdx/data/test_dataset_noncore.py +++ b/tests/hdx/data/test_dataset_noncore.py @@ -769,7 +769,7 @@ def test_set_quickchart_resource(self, configuration): assert resources[0]["dataset_preview_enabled"] == "True" assert resources[1]["dataset_preview_enabled"] == "False" - def test_quickcharts_resource_last(self): + def test_quickcharts_resource_last(self, configuration): datasetdata = copy.deepcopy(dataset_data) resourcesdata = copy.deepcopy(resources_data) datasetdata["resources"] = 
resourcesdata From 986f6a62a51ec8edd9fd1ed5243ef4ecedb4a0d8 Mon Sep 17 00:00:00 2001 From: mcarans Date: Fri, 10 Oct 2025 11:10:28 +1300 Subject: [PATCH 3/6] Progress but unfinished --- pyproject.toml | 2 +- requirements.txt | 2 +- src/hdx/data/dataset.py | 250 +++++++- .../conflict_data_alg.csv | 6 + .../min_qc_conflict_data_alg.csv | 0 .../qc_conflict_data_alg.csv | 0 .../test_data_no_data.csv | 0 .../test_data_no_years.csv | 0 .../gen_resource/conflict_data_alg.csv | 1 - tests/hdx/data/test_dataset_noncore.py | 2 +- .../data/test_dataset_resource_generation.py | 545 +++++++++++++++++- 11 files changed, 790 insertions(+), 18 deletions(-) create mode 100755 tests/fixtures/download_gen_resource/conflict_data_alg.csv rename tests/fixtures/{gen_resource => download_gen_resource}/min_qc_conflict_data_alg.csv (100%) rename tests/fixtures/{gen_resource => download_gen_resource}/qc_conflict_data_alg.csv (100%) rename tests/fixtures/{gen_resource => download_gen_resource}/test_data_no_data.csv (100%) rename tests/fixtures/{gen_resource => download_gen_resource}/test_data_no_years.csv (100%) diff --git a/pyproject.toml b/pyproject.toml index fbba8e6..429936f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "defopt>=7.0.0", "email_validator", "hdx-python-country>=3.9.8", - "hdx-python-utilities>=3.9.4", + "hdx-python-utilities>=3.9.5", "libhxl>=5.2.2", "makefun", "quantulum3", diff --git a/requirements.txt b/requirements.txt index a4ef5f7..2c1dc5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -251,7 +251,7 @@ requests-oauthlib==2.0.0 # via google-auth-oauthlib rfc3986==2.0.0 # via frictionless -rich==14.1.0 +rich==14.2.0 # via typer rpds-py==0.27.1 # via diff --git a/src/hdx/data/dataset.py b/src/hdx/data/dataset.py index b51586f..251c602 100755 --- a/src/hdx/data/dataset.py +++ b/src/hdx/data/dataset.py @@ -2615,6 +2615,134 @@ def remove_dates_from_title( self.set_time_period(startdate, enddate) return ranges + def 
generate_resource( + self, + folder: str, + filename: str, + rows: Iterable[ListTupleDict], + resourcedata: Dict, + headers: Union[int, ListTuple[str], None] = None, + columns: Union[ListTuple[int], ListTuple[str], None] = None, + format: str = "csv", + encoding: Optional[str] = None, + datecol: Optional[Union[int, str]] = None, + yearcol: Optional[Union[int, str]] = None, + date_function: Optional[Callable[[Dict], Optional[Dict]]] = None, + ) -> Tuple[bool, Dict]: + """Write rows to file and create resource, adding it to the dataset. The headers + argument is either a row number (rows start counting at 1), or the actual + headers defined as a list of strings. If not set, all rows will be treated as + containing values. Specific columns to include can be specified (ie. a subset of + the headers). + + The returned dictionary will contain the resource in the key resource, headers + in the key headers and list of rows in the key rows. + + The time period can optionally be set by supplying a column in + which the date or year is to be looked up. Note that any timezone + information is ignored and UTC assumed. Alternatively, a function can + be supplied to handle any dates in a row. It should accept a row and + should return None to ignore the row or a dictionary which can either + be empty if there are no dates in the row or can be populated with + keys startdate and/or enddate which are of type timezone-aware + datetime. The lowest start date and highest end date are used to set + the time period and are returned in the results dictionary in keys + startdate and enddate. + + Args: + folder (str): Folder to which to write file containing rows + filename (str): Filename of file to write rows + rows (Iterable[ListTupleDict]): List of rows in dict or list form + resourcedata (Dict): Resource data + headers (Union[int, ListTuple[str], None]): All headers. Defaults to None. + columns (Union[ListTuple[int], ListTuple[str], None]): Columns to write. Defaults to all. 
+ format (str): Format to write. Defaults to csv. + encoding (Optional[str]): Encoding to use. Defaults to None (infer encoding). + datecol (Optional[Union[int, str]]): Date column for setting time period. Defaults to None (don't set). + yearcol (Optional[Union[int, str]]): Year column for setting dataset year range. Defaults to None (don't set). + date_function (Optional[Callable[[Dict], Optional[Dict]]]): Date function to call for each row. Defaults to None. + + Returns: + Tuple[bool, Dict]: (True if resource added, dictionary of results) + """ + if [datecol, yearcol, date_function].count(None) < 2: + raise HDXError("Supply one of datecol, yearcol or date_function!") + retdict = {} + dates = [default_enddate, default_date] + + if yearcol is not None: + + def yearcol_function(row): + result = {} + year = row[yearcol] + if year: + result["startdate"], result["enddate"] = parse_date_range( + year, + zero_time=True, + max_endtime=True, + ) + return result + + date_function = yearcol_function + elif datecol is not None: + + def datecol_function(row): + result = {} + date = row[datecol] + if date: + date = parse_date(date) + result["startdate"] = date + result["enddate"] = date + return result + + date_function = datecol_function + + def process_row(row: ListTupleDict) -> Optional[ListTupleDict]: + if date_function is None: + return row + result = date_function(row) + if result is None: + return None + startdate = result.get("startdate") + if startdate is not None: + if startdate < dates[0]: + dates[0] = startdate + enddate = result.get("enddate") + if enddate is not None: + if enddate > dates[1]: + dates[1] = enddate + return row + + filepath = join(folder, filename) + rows = save_iterable( + filepath, + rows, + headers, + columns, + format=format, + encoding=encoding, + row_function=process_row, + ) + if not rows: + logger.error(f"No data rows in {filename}!") + return False, retdict + if yearcol is not None or date_function is not None: + if dates[0] == default_enddate or dates[1] == default_date: + logger.error(f"No dates in {filename}!") + return False, retdict + else: + retdict["startdate"] = dates[0] + retdict["enddate"] = dates[1] + 
self.set_time_period(dates[0], dates[1]) + resource = res_module.Resource(resourcedata) + resource.set_format(format) + resource.set_file_to_upload(filepath) + self.add_update_resource(resource) + retdict["resource"] = resource + retdict["headers"] = headers + retdict["rows"] = rows + return True, retdict + def generate_resource_from_rows( self, folder: str, @@ -2622,7 +2750,6 @@ def generate_resource_from_rows( rows: Iterable[ListTupleDict], resourcedata: Dict, headers: Optional[ListTuple[str]] = None, - format: str = "csv", encoding: Optional[str] = None, ) -> Optional["Resource"]: """Write rows to csv and create resource, adding it to the dataset. @@ -2636,23 +2763,19 @@ def generate_resource_from_rows( rows (Iterable[ListTupleDict]): List of rows in dict or list form resourcedata (Dict): Resource data headers (Optional[ListTuple[str]]): List of headers. Defaults to None. - format (str): Format to write. Defaults to csv. encoding (Optional[str]): Encoding to use. Defaults to None (infer encoding). 
Returns: Optional[Resource]: The created resource or None if not created """ - filepath = join(folder, filename) - res = save_iterable( - filepath, rows, columns=headers, format=format, encoding=encoding + warnings.warn( + "generate_resource_from_rows() is deprecated, use generate_resource() instead", + DeprecationWarning, ) - if not res: - return None - resource = res_module.Resource(resourcedata) - resource.set_format(format) - resource.set_file_to_upload(filepath) - self.add_update_resource(resource) - return resource + res, retdict = self.generate_resource( + folder, filename, rows, resourcedata, headers, headers, "csv", encoding + ) + return retdict["resource"] if res else None def generate_resource_from_iterable( self, @@ -2719,6 +2842,10 @@ def generate_resource_from_iterable( Returns: Tuple[bool, Dict]: (True if resource added, dictionary of results) """ + warnings.warn( + "generate_resource_from_iterable() is deprecated, use generate_resource() instead", + DeprecationWarning, + ) if [datecol, yearcol, date_function].count(None) < 2: raise HDXError("Supply one of datecol, yearcol or date_function!") retdict = {} @@ -2877,7 +3004,7 @@ def generate_resource_from_iterator( encoding: Optional[str] = None, ) -> Tuple[bool, Dict]: warnings.warn( - "generate_resource_from_iterator() is deprecated, use generate_resource_from_iterable() instead", + "generate_resource_from_iterator() is deprecated, use generate_resource() instead", DeprecationWarning, ) return self.generate_resource_from_iterable( @@ -2894,6 +3021,103 @@ def generate_resource_from_iterator( encoding, ) + def download_generate_resource( + self, + downloader: BaseDownload, + url: str, + folder: str, + filename: str, + resourcedata: Dict, + header_insertions: Optional[ListTuple[Tuple[int, str]]] = None, + row_function: Optional[Callable[[List[str], Dict], Dict]] = None, + datecol: Optional[str] = None, + yearcol: Optional[str] = None, + date_function: Optional[Callable[[Dict], Optional[Dict]]] = None, + 
**kwargs: Any, + ) -> Tuple[bool, Dict]: + """Download url, write rows to csv and create resource, adding to it + the dataset. The returned dictionary will contain the resource in the + key resource, headers in the key headers and list of rows in the key + rows. + + Optionally, headers can be inserted at specific positions. This is + achieved using the header_insertions argument. If supplied, it is a + list of tuples of the form (position, header) to be inserted. A + function is called for each row. If supplied, it takes as arguments: + headers (prior to any insertions) and row (which will be in dict or + list form depending upon the dict_rows argument) and outputs a modified + row. + + The time period can optionally be set by supplying a column in + which the date or year is to be looked up. Note that any timezone + information is ignored and UTC assumed. Alternatively, a function can + be supplied to handle any dates in a row. It should accept a row and + should return None to ignore the row or a dictionary which can either + be empty if there are no dates in the row or can be populated with + keys startdate and/or enddate which are of type timezone-aware + datetime. The lowest start date and highest end date are used to set + the time period and are returned in the results dictionary in keys + startdate and enddate. + + If the parameter quickcharts is supplied then various QuickCharts + related actions will occur depending upon the keys given in the + dictionary and the returned dictionary will contain the QuickCharts + resource in the key qc_resource. If the keys: hashtag - the HXL hashtag + to examine - and values - the 3 values to look for in that column - are + supplied, then a list of booleans indicating which QuickCharts bites + should be enabled will be returned in the key bites_disabled in the + returned dictionary. 
For the 3 values, if the key: numeric_hashtag is + supplied then if that column for a given value contains no numbers, + then the corresponding bite will be disabled. If the key: cutdown is + given, if it is 1, then a separate cut down list is created containing + only columns with HXL hashtags and rows with desired values (if hashtag + and values are supplied) for the purpose of driving QuickCharts. It is + returned in the key qcrows in the returned dictionary with the matching + headers in qcheaders. If cutdown is 2, then a resource is created using + the cut down list. If the key cutdownhashtags is supplied, then only + the provided hashtags are used for cutting down otherwise the full list + of HXL tags is used. + + Args: + downloader (BaseDownload): A Download or Retrieve object + url (str): URL to download + hxltags (Dict[str,str]): Header to HXL hashtag mapping + folder (str): Folder to which to write file containing rows + filename (str): Filename of file to write rows + resourcedata (Dict): Resource data + header_insertions (Optional[ListTuple[Tuple[int,str]]]): List of (position, header) to insert. Defaults to None. + row_function (Optional[Callable[[List[str],Dict],Dict]]): Function to call for each row. Defaults to None. + datecol (Optional[str]): Date column for setting time period. Defaults to None (don't set). + yearcol (Optional[str]): Year column for setting dataset year range. Defaults to None (don't set). + date_function (Optional[Callable[[Dict],Optional[Dict]]]): Date function to call for each row. Defaults to None. 
+ **kwargs: Any additional args to pass to downloader.get_tabular_rows + + Returns: + Tuple[bool, Dict]: (True if resource added, dictionary of results) + """ + headers, iterator = downloader.get_tabular_rows( + url, + dict_form=True, + header_insertions=header_insertions, + row_function=row_function, + format="csv", + **kwargs, + ) + # Rows arrive in dict form, so pass them straight to generate_resource(); + # this method takes no hxltags or quickcharts, so none are forwarded. + return self.generate_resource( + folder, + filename, + iterator, + resourcedata, + headers, + format="csv", + encoding=kwargs.get("encoding", None), + datecol=datecol, + yearcol=yearcol, + date_function=date_function, + ) + def download_and_generate_resource( self, downloader: BaseDownload, diff --git a/tests/fixtures/download_gen_resource/conflict_data_alg.csv b/tests/fixtures/download_gen_resource/conflict_data_alg.csv new file mode 100755 index 0000000..e1a7e59 --- /dev/null +++ b/tests/fixtures/download_gen_resource/conflict_data_alg.csv @@ -0,0 +1,6 @@ +lala,GWNO,EVENT_ID_CNTY,EVENT_ID_NO_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,EVENT_TYPE,ACTOR1,ALLY_ACTOR_1,INTER1,ACTOR2,ALLY_ACTOR_2,INTER2,INTERACTION,COUNTRY,ADMIN1,ADMIN2,ADMIN3,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,NOTES,FATALITIES +,,#event+code,,#date+occurred,#date+year,,#event+type,#group+name+first,,,#group+name+second,,,,#country+name,#adm1+name,#adm2+name,#adm3+name,#loc+name,#geo+lat,#geo+lon,,#meta+source,#description,#affected+killed +lala,615,1416RTA,,18/04/2001,2001,1,Violence against civilians,Police Forces of Algeria (1999-),,1,Civilians (Algeria),Berber Ethnic Group (Algeria),7,17,Algeria,Tizi Ouzou,Beni-Douala,,Beni Douala,36.61954,4.08282,1,Associated Press Online,A Berber student was shot while in police custody at a police station in Beni Douala. 
He later died on Apr.21.,1 +lala,615,2229RTA,,19/04/2001,2001,1,Riots/Protests,Rioters (Algeria),Berber Ethnic Group (Algeria),5,Police Forces of Algeria (1999-),,1,15,Algeria,Tizi Ouzou,Tizi Ouzou,,Tizi Ouzou,36.71183,4.04591,3,Kabylie report,"Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.",0 +lala,615,2230RTA,,20/04/2001,2002,1,Riots/Protests,Protesters (Algeria),Students (Algeria),6,,,0,60,Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,1,Crisis Group,Students protested in the Amizour area. At least 3 were later arrested for allegedly insulting gendarmes., +lala,615,2231RTA,,21/04/2001,2001,1,Riots/Protests,Rioters (Algeria),Berber Ethnic Group (Algeria),5,Police Forces of Algeria (1999-),,1,15,Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,1,Kabylie report,"Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.",0 diff --git a/tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv b/tests/fixtures/download_gen_resource/min_qc_conflict_data_alg.csv similarity index 100% rename from tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv rename to tests/fixtures/download_gen_resource/min_qc_conflict_data_alg.csv diff --git a/tests/fixtures/gen_resource/qc_conflict_data_alg.csv b/tests/fixtures/download_gen_resource/qc_conflict_data_alg.csv similarity index 100% rename from tests/fixtures/gen_resource/qc_conflict_data_alg.csv rename to tests/fixtures/download_gen_resource/qc_conflict_data_alg.csv diff --git a/tests/fixtures/gen_resource/test_data_no_data.csv b/tests/fixtures/download_gen_resource/test_data_no_data.csv similarity index 100% rename from tests/fixtures/gen_resource/test_data_no_data.csv rename to tests/fixtures/download_gen_resource/test_data_no_data.csv diff --git a/tests/fixtures/gen_resource/test_data_no_years.csv 
b/tests/fixtures/download_gen_resource/test_data_no_years.csv similarity index 100% rename from tests/fixtures/gen_resource/test_data_no_years.csv rename to tests/fixtures/download_gen_resource/test_data_no_years.csv diff --git a/tests/fixtures/gen_resource/conflict_data_alg.csv b/tests/fixtures/gen_resource/conflict_data_alg.csv index e1a7e59..44315d8 100755 --- a/tests/fixtures/gen_resource/conflict_data_alg.csv +++ b/tests/fixtures/gen_resource/conflict_data_alg.csv @@ -1,5 +1,4 @@ lala,GWNO,EVENT_ID_CNTY,EVENT_ID_NO_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,EVENT_TYPE,ACTOR1,ALLY_ACTOR_1,INTER1,ACTOR2,ALLY_ACTOR_2,INTER2,INTERACTION,COUNTRY,ADMIN1,ADMIN2,ADMIN3,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,NOTES,FATALITIES -,,#event+code,,#date+occurred,#date+year,,#event+type,#group+name+first,,,#group+name+second,,,,#country+name,#adm1+name,#adm2+name,#adm3+name,#loc+name,#geo+lat,#geo+lon,,#meta+source,#description,#affected+killed lala,615,1416RTA,,18/04/2001,2001,1,Violence against civilians,Police Forces of Algeria (1999-),,1,Civilians (Algeria),Berber Ethnic Group (Algeria),7,17,Algeria,Tizi Ouzou,Beni-Douala,,Beni Douala,36.61954,4.08282,1,Associated Press Online,A Berber student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.,1 lala,615,2229RTA,,19/04/2001,2001,1,Riots/Protests,Rioters (Algeria),Berber Ethnic Group (Algeria),5,Police Forces of Algeria (1999-),,1,15,Algeria,Tizi Ouzou,Tizi Ouzou,,Tizi Ouzou,36.71183,4.04591,3,Kabylie report,"Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.",0 lala,615,2230RTA,,20/04/2001,2002,1,Riots/Protests,Protesters (Algeria),Students (Algeria),6,,,0,60,Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,1,Crisis Group,Students protested in the Amizour area. 
At least 3 were later arrested for allegedly insulting gendarmes., diff --git a/tests/hdx/data/test_dataset_noncore.py b/tests/hdx/data/test_dataset_noncore.py index 38624a0..88fbf50 100755 --- a/tests/hdx/data/test_dataset_noncore.py +++ b/tests/hdx/data/test_dataset_noncore.py @@ -909,7 +909,7 @@ def test_generate_resource_view( with pytest.raises(IOError): dataset.generate_quickcharts() - def test_remove_dates_from_title(self): + def test_remove_dates_from_title(self, configuration): dataset = Dataset() with pytest.raises(HDXError): dataset.remove_dates_from_title() diff --git a/tests/hdx/data/test_dataset_resource_generation.py b/tests/hdx/data/test_dataset_resource_generation.py index 4329346..29598c3 100644 --- a/tests/hdx/data/test_dataset_resource_generation.py +++ b/tests/hdx/data/test_dataset_resource_generation.py @@ -38,6 +38,549 @@ class TestDatasetResourceGeneration: "ISO3": "#country+code", } + def test_generate_resource(self, configuration): + with temp_dir("test") as folder: + filename = "conflict_data_alg.csv" + resourcedata = { + "name": "Conflict Data for Algeria", + "description": "Conflict data with HXL tags", + } + admin1s = set() + + def process_row(headers, row): + row["lala"] = "lala" + admin1 = row.get("ADMIN1") + if admin1 is not None: + admin1s.add(admin1) + return row + + dataset = Dataset() + with Download(user_agent="test") as downloader: + headers, iterator = downloader.get_tabular_rows( + TestDatasetResourceGeneration.url, + dict_form=True, + header_insertions=[(0, "lala")], + row_function=process_row, + format="csv", + ) + + success, results = dataset.generate_resource( + folder, + filename, + iterator, + resourcedata, + headers, + yearcol="YEAR", + ) + assert success is True + assert results == { + "startdate": datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc), + "enddate": datetime(2002, 12, 31, 23, 59, 59, tzinfo=timezone.utc), + "resource": { + "description": "Conflict data with HXL tags", + "format": "csv", + "name": "Conflict 
Data for Algeria", + }, + "headers": [ + "lala", + "GWNO", + "EVENT_ID_CNTY", + "EVENT_ID_NO_CNTY", + "EVENT_DATE", + "YEAR", + "TIME_PRECISION", + "EVENT_TYPE", + "ACTOR1", + "ALLY_ACTOR_1", + "INTER1", + "ACTOR2", + "ALLY_ACTOR_2", + "INTER2", + "INTERACTION", + "COUNTRY", + "ADMIN1", + "ADMIN2", + "ADMIN3", + "LOCATION", + "LATITUDE", + "LONGITUDE", + "GEO_PRECISION", + "SOURCE", + "NOTES", + "FATALITIES", + ], + "rows": [ + { + "GWNO": "615", + "EVENT_ID_CNTY": "1416RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "18/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Violence against civilians", + "ACTOR1": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_1": None, + "INTER1": "1", + "ACTOR2": "Civilians (Algeria)", + "ALLY_ACTOR_2": "Berber Ethnic Group (Algeria)", + "INTER2": "7", + "INTERACTION": "17", + "COUNTRY": "Algeria", + "ADMIN1": "Tizi Ouzou", + "ADMIN2": "Beni-Douala", + "ADMIN3": None, + "LOCATION": "Beni Douala", + "LATITUDE": "36.61954", + "LONGITUDE": "4.08282", + "GEO_PRECISION": "1", + "SOURCE": "Associated Press Online", + "NOTES": "A Berber student was shot while in police custody at a police station in Beni Douala. 
He later died on Apr.21.", + "FATALITIES": "1", + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2229RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "19/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Rioters (Algeria)", + "ALLY_ACTOR_1": "Berber Ethnic Group (Algeria)", + "INTER1": "5", + "ACTOR2": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_2": None, + "INTER2": "1", + "INTERACTION": "15", + "COUNTRY": "Algeria", + "ADMIN1": "Tizi Ouzou", + "ADMIN2": "Tizi Ouzou", + "ADMIN3": None, + "LOCATION": "Tizi Ouzou", + "LATITUDE": "36.71183", + "LONGITUDE": "4.04591", + "GEO_PRECISION": "3", + "SOURCE": "Kabylie report", + "NOTES": "Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.", + "FATALITIES": "0", + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2230RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "20/04/2001", + "YEAR": "2002", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Protesters (Algeria)", + "ALLY_ACTOR_1": "Students (Algeria)", + "INTER1": "6", + "ACTOR2": None, + "ALLY_ACTOR_2": None, + "INTER2": "0", + "INTERACTION": "60", + "COUNTRY": "Algeria", + "ADMIN1": "Bejaia", + "ADMIN2": "Amizour", + "ADMIN3": None, + "LOCATION": "Amizour", + "LATITUDE": "36.64022", + "LONGITUDE": "4.90131", + "GEO_PRECISION": "1", + "SOURCE": "Crisis Group", + "NOTES": "Students protested in the Amizour area. 
At least 3 were later arrested for allegedly insulting gendarmes.", + "FATALITIES": None, + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2231RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "21/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Rioters (Algeria)", + "ALLY_ACTOR_1": "Berber Ethnic Group (Algeria)", + "INTER1": "5", + "ACTOR2": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_2": None, + "INTER2": "1", + "INTERACTION": "15", + "COUNTRY": "Algeria", + "ADMIN1": "Bejaia", + "ADMIN2": "Amizour", + "ADMIN3": None, + "LOCATION": "Amizour", + "LATITUDE": "36.64022", + "LONGITUDE": "4.90131", + "GEO_PRECISION": "1", + "SOURCE": "Kabylie report", + "NOTES": "Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.", + "FATALITIES": "0", + "lala": "lala", + }, + ], + } + assert ( + dataset["dataset_date"] + == "[2001-01-01T00:00:00 TO 2002-12-31T23:59:59]" + ) + assert admin1s == {"Bejaia", "Tizi Ouzou"} + resources = dataset.get_resources() + assert resources == [ + { + "name": "Conflict Data for Algeria", + "description": "Conflict data with HXL tags", + "format": "csv", + }, + ] + assert_files_same( + join("tests", "fixtures", "gen_resource", filename), + join(folder, filename), + ) + + success, results = dataset.download_and_generate_resource( + downloader, + TestDatasetResourceGeneration.url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + datecol="EVENT_DATE", + quickcharts=quickcharts, + ) + assert success is True + assert ( + dataset["dataset_date"] + == "[2001-04-18T00:00:00 TO 2001-04-21T23:59:59]" + ) + + quickcharts = { + "hashtag": "#event+code", + "values": ["1416RTA", "2230RTA", "2231RTA"], + "numeric_hashtag": "#affected+killed", + "cutdown": 2, + "cutdownhashtags": ["#event+code"], + } + success, results = 
dataset.download_and_generate_resource( + downloader, + TestDatasetResourceGeneration.url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + yearcol="YEAR", + quickcharts=quickcharts, + ) + assert success is True + assert results == { + "startdate": datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc), + "enddate": datetime(2002, 12, 31, 23, 59, 59, tzinfo=timezone.utc), + "bites_disabled": [False, True, False], + "resource": { + "description": "Conflict data with HXL tags", + "format": "csv", + "name": "Conflict Data for Algeria", + }, + "headers": [ + "lala", + "GWNO", + "EVENT_ID_CNTY", + "EVENT_ID_NO_CNTY", + "EVENT_DATE", + "YEAR", + "TIME_PRECISION", + "EVENT_TYPE", + "ACTOR1", + "ALLY_ACTOR_1", + "INTER1", + "ACTOR2", + "ALLY_ACTOR_2", + "INTER2", + "INTERACTION", + "COUNTRY", + "ADMIN1", + "ADMIN2", + "ADMIN3", + "LOCATION", + "LATITUDE", + "LONGITUDE", + "GEO_PRECISION", + "SOURCE", + "NOTES", + "FATALITIES", + ], + "rows": [ + { + "lala": "", + "GWNO": "", + "EVENT_ID_CNTY": "#event+code", + "EVENT_ID_NO_CNTY": "", + "EVENT_DATE": "#date+occurred", + "YEAR": "#date+year", + "TIME_PRECISION": "", + "EVENT_TYPE": "#event+type", + "ACTOR1": "#group+name+first", + "ALLY_ACTOR_1": "", + "INTER1": "", + "ACTOR2": "#group+name+second", + "ALLY_ACTOR_2": "", + "INTER2": "", + "INTERACTION": "", + "COUNTRY": "#country+name", + "ADMIN1": "#adm1+name", + "ADMIN2": "#adm2+name", + "ADMIN3": "#adm3+name", + "LOCATION": "#loc+name", + "LATITUDE": "#geo+lat", + "LONGITUDE": "#geo+lon", + "GEO_PRECISION": "", + "SOURCE": "#meta+source", + "NOTES": "#description", + "FATALITIES": "#affected+killed", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "1416RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "18/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Violence against civilians", + "ACTOR1": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_1": None, + "INTER1": 
"1", + "ACTOR2": "Civilians (Algeria)", + "ALLY_ACTOR_2": "Berber Ethnic Group (Algeria)", + "INTER2": "7", + "INTERACTION": "17", + "COUNTRY": "Algeria", + "ADMIN1": "Tizi Ouzou", + "ADMIN2": "Beni-Douala", + "ADMIN3": None, + "LOCATION": "Beni Douala", + "LATITUDE": "36.61954", + "LONGITUDE": "4.08282", + "GEO_PRECISION": "1", + "SOURCE": "Associated Press Online", + "NOTES": "A Berber student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.", + "FATALITIES": "1", + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2229RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "19/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Rioters (Algeria)", + "ALLY_ACTOR_1": "Berber Ethnic Group (Algeria)", + "INTER1": "5", + "ACTOR2": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_2": None, + "INTER2": "1", + "INTERACTION": "15", + "COUNTRY": "Algeria", + "ADMIN1": "Tizi Ouzou", + "ADMIN2": "Tizi Ouzou", + "ADMIN3": None, + "LOCATION": "Tizi Ouzou", + "LATITUDE": "36.71183", + "LONGITUDE": "4.04591", + "GEO_PRECISION": "3", + "SOURCE": "Kabylie report", + "NOTES": "Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.", + "FATALITIES": "0", + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2230RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "20/04/2001", + "YEAR": "2002", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Protesters (Algeria)", + "ALLY_ACTOR_1": "Students (Algeria)", + "INTER1": "6", + "ACTOR2": None, + "ALLY_ACTOR_2": None, + "INTER2": "0", + "INTERACTION": "60", + "COUNTRY": "Algeria", + "ADMIN1": "Bejaia", + "ADMIN2": "Amizour", + "ADMIN3": None, + "LOCATION": "Amizour", + "LATITUDE": "36.64022", + "LONGITUDE": "4.90131", + "GEO_PRECISION": "1", + "SOURCE": "Crisis Group", + "NOTES": "Students protested in the 
Amizour area. At least 3 were later arrested for allegedly insulting gendarmes.", + "FATALITIES": None, + "lala": "lala", + }, + { + "GWNO": "615", + "EVENT_ID_CNTY": "2231RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "21/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Riots/Protests", + "ACTOR1": "Rioters (Algeria)", + "ALLY_ACTOR_1": "Berber Ethnic Group (Algeria)", + "INTER1": "5", + "ACTOR2": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_2": None, + "INTER2": "1", + "INTERACTION": "15", + "COUNTRY": "Algeria", + "ADMIN1": "Bejaia", + "ADMIN2": "Amizour", + "ADMIN3": None, + "LOCATION": "Amizour", + "LATITUDE": "36.64022", + "LONGITUDE": "4.90131", + "GEO_PRECISION": "1", + "SOURCE": "Kabylie report", + "NOTES": "Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.", + "FATALITIES": "0", + "lala": "lala", + }, + ], + "qc_resource": { + "description": "Cut down data for QuickCharts", + "format": "csv", + "name": "QuickCharts-Conflict Data for Algeria", + }, + "qcheaders": ["EVENT_ID_CNTY", "FATALITIES"], + "qcrows": [ + { + "EVENT_ID_CNTY": "#event+code", + "FATALITIES": "#affected+killed", + }, + {"EVENT_ID_CNTY": "1416RTA", "FATALITIES": "1"}, + {"EVENT_ID_CNTY": "2231RTA", "FATALITIES": "0"}, + ], + } + + def process_year(row): + year = row["YEAR"] + if year == "2002": + return None + startdate, enddate = parse_date_range( + year, zero_time=True, max_endtime=True + ) + return {"startdate": startdate, "enddate": enddate} + + del quickcharts["hashtag"] + del quickcharts["numeric_hashtag"] + success, results = dataset.download_and_generate_resource( + downloader, + TestDatasetResourceGeneration.url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + date_function=process_year, + quickcharts=quickcharts, + ) + assert success is True + assert results["startdate"] == datetime( + 
2001, 1, 1, 0, 0, tzinfo=timezone.utc + ) + assert results["enddate"] == datetime( + 2001, 12, 31, 23, 59, 59, tzinfo=timezone.utc + ) + assert ( + dataset["dataset_date"] + == "[2001-01-01T00:00:00 TO 2001-12-31T23:59:59]" + ) + assert_files_same( + join( + "tests", + "fixtures", + "gen_resource", + f"min_{qc_filename}", + ), + join(folder, qc_filename), + ) + + with pytest.raises(HDXError): + dataset.download_and_generate_resource( + downloader, + TestDatasetResourceGeneration.url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + yearcol="YEAR", + date_function=process_year, + ) + success, results = dataset.download_and_generate_resource( + downloader, + TestDatasetResourceGeneration.url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + ) + assert success is True + url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/empty.csv" + success, results = dataset.download_and_generate_resource( + downloader, + url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + yearcol="YEAR", + ) + assert success is False + url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/gen_resource/test_data_no_data.csv" + success, results = dataset.download_and_generate_resource( + downloader, + url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + row_function=process_row, + quickcharts=quickcharts, + ) + assert success is False + url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/gen_resource/test_data_no_years.csv" + success, results = dataset.download_and_generate_resource( + downloader, + url, + TestDatasetResourceGeneration.hxltags, + folder, + filename, + resourcedata, + header_insertions=[(0, "lala")], + 
row_function=process_row, + yearcol="YEAR", + ) + assert success is False + def test_download_and_generate_resource(self, configuration): with temp_dir("test") as folder: filename = "conflict_data_alg.csv" @@ -352,7 +895,7 @@ def process_row(headers, row): }, ] assert_files_same( - join("tests", "fixtures", "gen_resource", filename), + join("tests", "fixtures", "download_gen_resource", filename), join(folder, filename), ) qc_filename = f"qc_{filename}" From 3eab990923b7ee388a26e91148af2445163de828 Mon Sep 17 00:00:00 2001 From: mcarans Date: Mon, 13 Oct 2025 13:45:25 +1300 Subject: [PATCH 4/6] Add download_generate_resource --- requirements.txt | 12 +-- src/hdx/data/dataset.py | 33 +++--- .../data/test_dataset_resource_generation.py | 102 +++--------------- 3 files changed, 39 insertions(+), 108 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2c1dc5e..aa74466 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ babel==2.17.0 # via mkdocs-material backrefs==5.9 # via mkdocs-material -cachetools==6.2.0 +cachetools==6.2.1 # via google-auth certifi==2025.10.5 # via requests @@ -66,7 +66,7 @@ gspread==6.2.1 # via hdx-python-api (pyproject.toml) hdx-python-country==3.9.8 # via hdx-python-api (pyproject.toml) -hdx-python-utilities==3.9.4 +hdx-python-utilities==3.9.5 # via # hdx-python-api (pyproject.toml) # hdx-python-country @@ -74,11 +74,11 @@ humanize==4.13.0 # via frictionless identify==2.6.15 # via pre-commit -idna==3.10 +idna==3.11 # via # email-validator # requests -ijson==3.4.0 +ijson==3.4.0.post0 # via hdx-python-utilities inflect==7.5.0 # via quantulum3 @@ -129,7 +129,7 @@ mergedeep==1.3.4 # via # mkdocs # mkdocs-get-deps -mkapi==4.4.5 +mkapi==4.5.0 # via hdx-python-api (pyproject.toml) mkdocs==1.6.1 # via @@ -313,7 +313,7 @@ urllib3==2.5.0 # requests validators==0.35.0 # via frictionless -virtualenv==20.34.0 +virtualenv==20.35.3 # via pre-commit watchdog==6.0.0 # via mkdocs diff --git a/src/hdx/data/dataset.py 
b/src/hdx/data/dataset.py index 251c602..6890674 100755 --- a/src/hdx/data/dataset.py +++ b/src/hdx/data/dataset.py @@ -3030,8 +3030,11 @@ def download_generate_resource( resourcedata: Dict, header_insertions: Optional[ListTuple[Tuple[int, str]]] = None, row_function: Optional[Callable[[List[str], Dict], Dict]] = None, - datecol: Optional[str] = None, - yearcol: Optional[str] = None, + columns: Union[ListTuple[int], ListTuple[str], None] = None, + format: str = "csv", + encoding: Optional[str] = None, + datecol: Optional[Union[int, str]] = None, + yearcol: Optional[Union[int, str]] = None, date_function: Optional[Callable[[Dict], Optional[Dict]]] = None, **kwargs: Any, ) -> Tuple[bool, Dict]: @@ -3087,10 +3090,12 @@ def download_generate_resource( resourcedata (Dict): Resource data header_insertions (Optional[ListTuple[Tuple[int,str]]]): List of (position, header) to insert. Defaults to None. row_function (Optional[Callable[[List[str],Dict],Dict]]): Function to call for each row. Defaults to None. - datecol (Optional[str]): Date column for setting time period. Defaults to None (don't set). - yearcol (Optional[str]): Year column for setting dataset year range. Defaults to None (don't set). - date_function (Optional[Callable[[Dict],Optional[Dict]]]): Date function to call for each row. Defaults to None. - quickcharts (Optional[Dict]): Dictionary containing optional keys: hashtag, values, cutdown and/or cutdownhashtags + columns (Union[ListTuple[int], ListTuple[str], None]): Columns to write. Defaults to all. + format (str): Format to write. Defaults to csv. + encoding (Optional[str]): Encoding to use. Defaults to None (infer encoding). 
+ datecol (Optional[Union[int, str]]): Date column for setting time period. Defaults to None (don't set). + yearcol (Optional[Union[int, str]]): Year column for setting dataset year range. Defaults to None (don't set). + date_function (Optional[Callable[[Dict],Optional[Dict]]]): Date function to call for each row. Defaults to None. **kwargs: Any additional args to pass to downloader.get_tabular_rows Returns: @@ -3104,18 +3109,18 @@ def download_generate_resource( format="csv", **kwargs, ) - return self.generate_resource_from_iterable( - headers, - iterator, - hxltags, + return self.generate_resource( folder, filename, + iterator, resourcedata, + headers, + columns=columns, + format=format, + encoding=encoding, datecol=datecol, yearcol=yearcol, date_function=date_function, - quickcharts=quickcharts, - encoding=kwargs.get("encoding", None), ) def download_and_generate_resource( @@ -3195,6 +3200,10 @@ def download_and_generate_resource( Returns: Tuple[bool, Dict]: (True if resource added, dictionary of results) """ + warnings.warn( + "download_and_generate_resource() is deprecated, use download_generate_resource() instead", + DeprecationWarning, + ) headers, iterator = downloader.get_tabular_rows( url, dict_form=True, diff --git a/tests/hdx/data/test_dataset_resource_generation.py b/tests/hdx/data/test_dataset_resource_generation.py index 29598c3..c88accb 100644 --- a/tests/hdx/data/test_dataset_resource_generation.py +++ b/tests/hdx/data/test_dataset_resource_generation.py @@ -38,7 +38,7 @@ class TestDatasetResourceGeneration: "ISO3": "#country+code", } - def test_generate_resource(self, configuration): + def test_download_generate_resource(self, configuration): with temp_dir("test") as folder: filename = "conflict_data_alg.csv" resourcedata = { @@ -56,20 +56,14 @@ def process_row(headers, row): dataset = Dataset() with Download(user_agent="test") as downloader: - headers, iterator = downloader.get_tabular_rows( + success, results = dataset.download_generate_resource( + downloader, TestDatasetResourceGeneration.url, - dict_form=True, - header_insertions=[(0, "lala")], - row_function=process_row, - format="csv", - ) - - success, results =
dataset.generate_resource( folder, filename, - iterator, resourcedata, - headers, + header_insertions=[(0, "lala")], + row_function=process_row, yearcol="YEAR", ) assert success is True @@ -242,17 +236,15 @@ def process_row(headers, row): join(folder, filename), ) - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, TestDatasetResourceGeneration.url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, header_insertions=[(0, "lala")], row_function=process_row, datecol="EVENT_DATE", - quickcharts=quickcharts, ) assert success is True assert ( @@ -260,30 +252,20 @@ def process_row(headers, row): == "[2001-04-18T00:00:00 TO 2001-04-21T23:59:59]" ) - quickcharts = { - "hashtag": "#event+code", - "values": ["1416RTA", "2230RTA", "2231RTA"], - "numeric_hashtag": "#affected+killed", - "cutdown": 2, - "cutdownhashtags": ["#event+code"], - } - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, TestDatasetResourceGeneration.url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, header_insertions=[(0, "lala")], row_function=process_row, yearcol="YEAR", - quickcharts=quickcharts, ) assert success is True assert results == { "startdate": datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc), "enddate": datetime(2002, 12, 31, 23, 59, 59, tzinfo=timezone.utc), - "bites_disabled": [False, True, False], "resource": { "description": "Conflict data with HXL tags", "format": "csv", @@ -318,34 +300,6 @@ def process_row(headers, row): "FATALITIES", ], "rows": [ - { - "lala": "", - "GWNO": "", - "EVENT_ID_CNTY": "#event+code", - "EVENT_ID_NO_CNTY": "", - "EVENT_DATE": "#date+occurred", - "YEAR": "#date+year", - "TIME_PRECISION": "", - "EVENT_TYPE": "#event+type", - "ACTOR1": "#group+name+first", - "ALLY_ACTOR_1": "", - "INTER1": "", - "ACTOR2": "#group+name+second", - "ALLY_ACTOR_2": "", - "INTER2": 
"", - "INTERACTION": "", - "COUNTRY": "#country+name", - "ADMIN1": "#adm1+name", - "ADMIN2": "#adm2+name", - "ADMIN3": "#adm3+name", - "LOCATION": "#loc+name", - "LATITUDE": "#geo+lat", - "LONGITUDE": "#geo+lon", - "GEO_PRECISION": "", - "SOURCE": "#meta+source", - "NOTES": "#description", - "FATALITIES": "#affected+killed", - }, { "GWNO": "615", "EVENT_ID_CNTY": "1416RTA", @@ -459,20 +413,6 @@ def process_row(headers, row): "lala": "lala", }, ], - "qc_resource": { - "description": "Cut down data for QuickCharts", - "format": "csv", - "name": "QuickCharts-Conflict Data for Algeria", - }, - "qcheaders": ["EVENT_ID_CNTY", "FATALITIES"], - "qcrows": [ - { - "EVENT_ID_CNTY": "#event+code", - "FATALITIES": "#affected+killed", - }, - {"EVENT_ID_CNTY": "1416RTA", "FATALITIES": "1"}, - {"EVENT_ID_CNTY": "2231RTA", "FATALITIES": "0"}, - ], } def process_year(row): @@ -484,19 +424,15 @@ def process_year(row): ) return {"startdate": startdate, "enddate": enddate} - del quickcharts["hashtag"] - del quickcharts["numeric_hashtag"] - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, TestDatasetResourceGeneration.url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, header_insertions=[(0, "lala")], row_function=process_row, date_function=process_year, - quickcharts=quickcharts, ) assert success is True assert results["startdate"] == datetime( @@ -509,15 +445,6 @@ def process_year(row): dataset["dataset_date"] == "[2001-01-01T00:00:00 TO 2001-12-31T23:59:59]" ) - assert_files_same( - join( - "tests", - "fixtures", - "gen_resource", - f"min_{qc_filename}", - ), - join(folder, qc_filename), - ) with pytest.raises(HDXError): dataset.download_and_generate_resource( @@ -530,10 +457,9 @@ def process_year(row): yearcol="YEAR", date_function=process_year, ) - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, 
TestDatasetResourceGeneration.url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, @@ -542,10 +468,9 @@ def process_year(row): ) assert success is True url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/empty.csv" - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, @@ -555,23 +480,20 @@ def process_year(row): ) assert success is False url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/gen_resource/test_data_no_data.csv" - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, header_insertions=[(0, "lala")], row_function=process_row, - quickcharts=quickcharts, ) assert success is False url = "https://raw.githubusercontent.com/OCHA-DAP/hdx-python-api/main/tests/fixtures/gen_resource/test_data_no_years.csv" - success, results = dataset.download_and_generate_resource( + success, results = dataset.download_generate_resource( downloader, url, - TestDatasetResourceGeneration.hxltags, folder, filename, resourcedata, From b5f6c042c93551d2240fa1c6fed37ca156232146 Mon Sep 17 00:00:00 2001 From: mcarans Date: Mon, 13 Oct 2025 14:24:27 +1300 Subject: [PATCH 5/6] Document new methods and deprecations --- documentation/index.md | 67 ++------ src/hdx/data/dataset.py | 6 +- .../data/test_dataset_resource_generation.py | 157 ++++++++++-------- 3 files changed, 112 insertions(+), 118 deletions(-) diff --git a/documentation/index.md b/documentation/index.md index eeea5e8..1748ec1 100755 --- a/documentation/index.md +++ b/documentation/index.md @@ -54,7 +54,10 @@ The library has detailed API documentation which can be found in the menu at the ## Breaking Changes -From 6., remove unused 
`generate_qc_resource_from_rows` method. +From 6.5.2, remove unused `generate_qc_resource_from_rows` method. +`generate_resource_from_rows`, `generate_resource_from_iterable` and +`download_and_generate_resource` are deprecated. They are replaced by +`generate_resource` and `download_generate_resource`. From 6.5.0, files will not be uploaded to the HDX filestore if the hash and size have not changed, but if there are any resource metadata changes, except for last_modified, @@ -823,16 +826,14 @@ dictionary. HEADERS is either a row number (rows start counting at 1), or the actual headers defined as a list of strings. If not set, all rows will be treated as containing values: - dataset.generate_resource_from_rows("FOLDER", "FILENAME", ROWS, - RESOURCE DATA, HEADERS, "ENCODING") + dataset.generate_resource("FOLDER", "FILENAME", ROWS, RESOURCE_DATA, HEADERS, + COLUMNS, "FORMAT", "ENCODING", DATECOL or YEARCOL or + DATE_FUNCTION) -Building on these basic resource generation methods, there are more powerful -ones `generate_resource_from_iterator` and `download_and_generate_resource`. - -A resource can be generated from a given list or tuple: HEADERS and an ITERATOR -which can return rows in list, tuple or dictionary form. A mapping from headers -to HXL hashtags, HXLTAGS, must be provided along with the FOLDER and FILENAME -where the file will be generated for upload to the filestore. The dataset +The first 4 parameters are mandatory, the rest are optional. A resource can be generated +from a given list or tuple or other iterable. The method returns a tuple with a bool, +True if the resource was added, and a dictionary of information. FOLDER and FILENAME +specify where the file will be generated for upload to the filestore. The dataset time period can optionally be set by supplying DATECOL for looking up dates or YEARCOL for looking up years. DATECOl and YEARCOL can be a column name or the index of a column.
Note that any timezone information is ignored and UTC @@ -846,40 +847,9 @@ datetime. The lowest start date and highest end date are used to set the time period and are returned in the results dictionary in keys startdate and enddate. - dataset.generate_resource_from_iterator(HEADERS, ITERATOR, HXLTAGS, - "FOLDER", "FILENAME", RESOURCE_DATA, DATECOL or YEARCOL or DATE_FUNCTION, - QUICKCHARTS, "ENCODING") - -If desired, `generate_resource_from_iterator` can generate a separate -QuickCharts resource designed to be used in a time series QuickCharts bite -provided that the input has #indicator+code, #date and #indicator+value+num. -This is achieved by supplying the parameter QUICKCHARTS which activates various -QuickCharts related actions depending upon the keys given in the dictionary. -The returned dictionary will contain the QuickCharts resource in the key -qc_resource. If the keys: hashtag - the HXL hashtag to examine - and values - -the 3 values to look for in that column - are supplied, then a list of booleans -indicating which QuickCharts bites should be enabled will be returned in the -key bites_disabled in the returned dictionary. For the 3 values, if the key: -numeric_hashtag is supplied then if that column for a given value contains no -numbers, then the corresponding bite will be disabled. If the key: cutdown is -given, if it is 1, then a separate cut down list is created containing only -columns with HXL hashtags and rows with desired values (if hashtag and values -are supplied) for the purpose of driving QuickCharts. It is returned in the key -qcrows in the returned dictionary with the matching headers in qcheaders. If -cutdown is 2, then a resource is created using the cut down list. If the key -cutdownhashtags is supplied, then only the provided hashtags are used for -cutting down otherwise the full list of HXL tags is used. 
- -The QuickCharts resource will be of form similar to below: - - GHO (CODE),ENDYEAR,Numeric - #indicator+code,#date+year+end,#indicator+value+num - VIOLENCE_HOMICIDERATE,1994,123.4 - MDG_0000000001,2015,123.4 - -`download_and_generate_resource` builds on `generate_resource_from_iterator`. -It uses an DOWNLOADER, an object of class `Download`, `Retrieve` or other class -that implements `BaseDownload` to download from URL. Additional arguments in +`download_generate_resource` builds on `generate_resource`. +It uses a DOWNLOADER, an object of class `Download`, `Retrieve` or other class +that implements `BaseDownload` to download from a URL. Additional arguments in **KWARGS are passed to the `get_tabular_rows` method of the DOWNLOADER. Optionally, headers can be inserted at specific positions. This is achieved @@ -889,12 +859,11 @@ row. If supplied, it takes as arguments: headers (prior to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and outputs a modified row. -The rest of the arguments are the same as for -`generate_resource_from_iterator`. +The rest of the arguments are the same as for `generate_resource`. 
- dataset.download_and_generate_resource(DOWNLOADER, "URL", HXLTAGS, - "FOLDER", "FILENAME", RESOURCE_DATA, HEADER_INSERTIONS, ROW_FUNCTION, - DATECOL or YEARCOL or DATE_FUNCTION, QUICKCHARTS, **KWARGS) + dataset.download_generate_resource(DOWNLOADER, "URL", "FOLDER", "FILENAME", + RESOURCE_DATA, HEADER_INSERTIONS, ROW_FUNCTION, + DATECOL or YEARCOL or DATE_FUNCTION, **KWARGS) ### QuickCharts Generation diff --git a/src/hdx/data/dataset.py b/src/hdx/data/dataset.py index 6890674..068c41c 100755 --- a/src/hdx/data/dataset.py +++ b/src/hdx/data/dataset.py @@ -2739,7 +2739,11 @@ def process_row(row: ListTupleDict) -> Optional[ListTupleDict]: resource.set_file_to_upload(filepath) self.add_update_resource(resource) retdict["resource"] = resource - retdict["headers"] = headers + if columns is not None: + retdict["headers"] = columns + retdict["original_headers"] = headers + else: + retdict["headers"] = headers retdict["rows"] = rows return True, retdict diff --git a/tests/hdx/data/test_dataset_resource_generation.py b/tests/hdx/data/test_dataset_resource_generation.py index c88accb..5dca7fa 100644 --- a/tests/hdx/data/test_dataset_resource_generation.py +++ b/tests/hdx/data/test_dataset_resource_generation.py @@ -43,7 +43,7 @@ def test_download_generate_resource(self, configuration): filename = "conflict_data_alg.csv" resourcedata = { "name": "Conflict Data for Algeria", - "description": "Conflict data with HXL tags", + "description": "Conflict data", } admin1s = set() @@ -66,45 +66,47 @@ def process_row(headers, row): row_function=process_row, yearcol="YEAR", ) + expected_headers = [ + "lala", + "GWNO", + "EVENT_ID_CNTY", + "EVENT_ID_NO_CNTY", + "EVENT_DATE", + "YEAR", + "TIME_PRECISION", + "EVENT_TYPE", + "ACTOR1", + "ALLY_ACTOR_1", + "INTER1", + "ACTOR2", + "ALLY_ACTOR_2", + "INTER2", + "INTERACTION", + "COUNTRY", + "ADMIN1", + "ADMIN2", + "ADMIN3", + "LOCATION", + "LATITUDE", + "LONGITUDE", + "GEO_PRECISION", + "SOURCE", + "NOTES", + "FATALITIES", + ] assert success 
is True assert results == { "startdate": datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc), "enddate": datetime(2002, 12, 31, 23, 59, 59, tzinfo=timezone.utc), "resource": { - "description": "Conflict data with HXL tags", + "description": "Conflict data", "format": "csv", "name": "Conflict Data for Algeria", }, - "headers": [ - "lala", - "GWNO", - "EVENT_ID_CNTY", - "EVENT_ID_NO_CNTY", - "EVENT_DATE", - "YEAR", - "TIME_PRECISION", - "EVENT_TYPE", - "ACTOR1", - "ALLY_ACTOR_1", - "INTER1", - "ACTOR2", - "ALLY_ACTOR_2", - "INTER2", - "INTERACTION", - "COUNTRY", - "ADMIN1", - "ADMIN2", - "ADMIN3", - "LOCATION", - "LATITUDE", - "LONGITUDE", - "GEO_PRECISION", - "SOURCE", - "NOTES", - "FATALITIES", - ], + "headers": expected_headers, "rows": [ { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "1416RTA", "EVENT_ID_NO_CNTY": None, @@ -130,9 +132,9 @@ def process_row(headers, row): "SOURCE": "Associated Press Online", "NOTES": "A Berber student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.", "FATALITIES": "1", - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2229RTA", "EVENT_ID_NO_CNTY": None, @@ -158,9 +160,9 @@ def process_row(headers, row): "SOURCE": "Kabylie report", "NOTES": "Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.", "FATALITIES": "0", - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2230RTA", "EVENT_ID_NO_CNTY": None, @@ -186,9 +188,9 @@ def process_row(headers, row): "SOURCE": "Crisis Group", "NOTES": "Students protested in the Amizour area. 
At least 3 were later arrested for allegedly insulting gendarmes.", "FATALITIES": None, - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2231RTA", "EVENT_ID_NO_CNTY": None, @@ -214,7 +216,6 @@ def process_row(headers, row): "SOURCE": "Kabylie report", "NOTES": "Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.", "FATALITIES": "0", - "lala": "lala", }, ], } @@ -227,7 +228,7 @@ def process_row(headers, row): assert resources == [ { "name": "Conflict Data for Algeria", - "description": "Conflict data with HXL tags", + "description": "Conflict data", "format": "csv", }, ] @@ -236,12 +237,35 @@ def process_row(headers, row): join(folder, filename), ) + columns_to_include = [ + "lala", + "GWNO", + "EVENT_ID_CNTY", + "EVENT_ID_NO_CNTY", + "EVENT_DATE", + "YEAR", + "TIME_PRECISION", + "EVENT_TYPE", + "ACTOR1", + "ALLY_ACTOR_1", + "INTER1", + "ACTOR2", + "ALLY_ACTOR_2", + "INTER2", + "INTERACTION", + "COUNTRY", + "ADMIN1", + "ADMIN2", + "ADMIN3", + "FATALITIES", + ] success, results = dataset.download_generate_resource( downloader, TestDatasetResourceGeneration.url, folder, filename, resourcedata, + columns=columns_to_include, header_insertions=[(0, "lala")], row_function=process_row, datecol="EVENT_DATE", @@ -251,6 +275,30 @@ def process_row(headers, row): dataset["dataset_date"] == "[2001-04-18T00:00:00 TO 2001-04-21T23:59:59]" ) + assert results["headers"] == columns_to_include + assert results["original_headers"] == expected_headers + assert results["rows"][0] == { + "lala": "lala", + "GWNO": "615", + "EVENT_ID_CNTY": "1416RTA", + "EVENT_ID_NO_CNTY": None, + "EVENT_DATE": "18/04/2001", + "YEAR": "2001", + "TIME_PRECISION": "1", + "EVENT_TYPE": "Violence against civilians", + "ACTOR1": "Police Forces of Algeria (1999-)", + "ALLY_ACTOR_1": None, + "INTER1": "1", + "ACTOR2": "Civilians (Algeria)", + "ALLY_ACTOR_2": "Berber Ethnic Group (Algeria)", + "INTER2": "7", + "INTERACTION": 
"17", + "COUNTRY": "Algeria", + "ADMIN1": "Tizi Ouzou", + "ADMIN2": "Beni-Douala", + "ADMIN3": None, + "FATALITIES": "1", + } success, results = dataset.download_generate_resource( downloader, @@ -267,40 +315,14 @@ def process_row(headers, row): "startdate": datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc), "enddate": datetime(2002, 12, 31, 23, 59, 59, tzinfo=timezone.utc), "resource": { - "description": "Conflict data with HXL tags", + "description": "Conflict data", "format": "csv", "name": "Conflict Data for Algeria", }, - "headers": [ - "lala", - "GWNO", - "EVENT_ID_CNTY", - "EVENT_ID_NO_CNTY", - "EVENT_DATE", - "YEAR", - "TIME_PRECISION", - "EVENT_TYPE", - "ACTOR1", - "ALLY_ACTOR_1", - "INTER1", - "ACTOR2", - "ALLY_ACTOR_2", - "INTER2", - "INTERACTION", - "COUNTRY", - "ADMIN1", - "ADMIN2", - "ADMIN3", - "LOCATION", - "LATITUDE", - "LONGITUDE", - "GEO_PRECISION", - "SOURCE", - "NOTES", - "FATALITIES", - ], + "headers": expected_headers, "rows": [ { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "1416RTA", "EVENT_ID_NO_CNTY": None, @@ -326,9 +348,9 @@ def process_row(headers, row): "SOURCE": "Associated Press Online", "NOTES": "A Berber student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.", "FATALITIES": "1", - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2229RTA", "EVENT_ID_NO_CNTY": None, @@ -354,9 +376,9 @@ def process_row(headers, row): "SOURCE": "Kabylie report", "NOTES": "Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.", "FATALITIES": "0", - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2230RTA", "EVENT_ID_NO_CNTY": None, @@ -382,9 +404,9 @@ def process_row(headers, row): "SOURCE": "Crisis Group", "NOTES": "Students protested in the Amizour area. 
At least 3 were later arrested for allegedly insulting gendarmes.", "FATALITIES": None, - "lala": "lala", }, { + "lala": "lala", "GWNO": "615", "EVENT_ID_CNTY": "2231RTA", "EVENT_ID_NO_CNTY": None, @@ -410,7 +432,6 @@ def process_row(headers, row): "SOURCE": "Kabylie report", "NOTES": "Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.", "FATALITIES": "0", - "lala": "lala", }, ], } From 53b2e899665d370e01b2770cee914638ecc13e91 Mon Sep 17 00:00:00 2001 From: mcarans Date: Mon, 13 Oct 2025 14:43:38 +1300 Subject: [PATCH 6/6] Fix tests --- tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv | 5 +++++ tests/fixtures/gen_resource/qc_conflict_data_alg.csv | 4 ++++ tests/fixtures/gen_resource/test_data_no_data.csv | 1 + tests/fixtures/gen_resource/test_data_no_years.csv | 5 +++++ 4 files changed, 15 insertions(+) create mode 100644 tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv create mode 100644 tests/fixtures/gen_resource/qc_conflict_data_alg.csv create mode 100644 tests/fixtures/gen_resource/test_data_no_data.csv create mode 100644 tests/fixtures/gen_resource/test_data_no_years.csv diff --git a/tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv b/tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv new file mode 100644 index 0000000..239a574 --- /dev/null +++ b/tests/fixtures/gen_resource/min_qc_conflict_data_alg.csv @@ -0,0 +1,5 @@ +EVENT_ID_CNTY +#event+code +1416RTA +2229RTA +2231RTA diff --git a/tests/fixtures/gen_resource/qc_conflict_data_alg.csv b/tests/fixtures/gen_resource/qc_conflict_data_alg.csv new file mode 100644 index 0000000..fa22a0d --- /dev/null +++ b/tests/fixtures/gen_resource/qc_conflict_data_alg.csv @@ -0,0 +1,4 @@ +EVENT_ID_CNTY,EVENT_DATE,YEAR,EVENT_TYPE,ACTOR1,ACTOR2,COUNTRY,ADMIN1,ADMIN2,ADMIN3,LOCATION,LATITUDE,LONGITUDE,SOURCE,NOTES,FATALITIES 
+#event+code,#date+occurred,#date+year,#event+type,#group+name+first,#group+name+second,#country+name,#adm1+name,#adm2+name,#adm3+name,#loc+name,#geo+lat,#geo+lon,#meta+source,#description,#affected+killed +1416RTA,18/04/2001,2001,Violence against civilians,Police Forces of Algeria (1999-),Civilians (Algeria),Algeria,Tizi Ouzou,Beni-Douala,,Beni Douala,36.61954,4.08282,Associated Press Online,A Berber student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.,1 +2231RTA,21/04/2001,2001,Riots/Protests,Rioters (Algeria),Police Forces of Algeria (1999-),Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,Kabylie report,"Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.",0 diff --git a/tests/fixtures/gen_resource/test_data_no_data.csv b/tests/fixtures/gen_resource/test_data_no_data.csv new file mode 100644 index 0000000..60f0622 --- /dev/null +++ b/tests/fixtures/gen_resource/test_data_no_data.csv @@ -0,0 +1 @@ +GWNO,EVENT_ID_CNTY,EVENT_ID_NO_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,EVENT_TYPE,ACTOR1,ALLY_ACTOR_1,INTER1,ACTOR2,ALLY_ACTOR_2,INTER2,INTERACTION,COUNTRY,ADMIN1,ADMIN2,ADMIN3,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,NOTES,FATALITIES diff --git a/tests/fixtures/gen_resource/test_data_no_years.csv b/tests/fixtures/gen_resource/test_data_no_years.csv new file mode 100644 index 0000000..f6c72e7 --- /dev/null +++ b/tests/fixtures/gen_resource/test_data_no_years.csv @@ -0,0 +1,5 @@ +GWNO,EVENT_ID_CNTY,EVENT_ID_NO_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,EVENT_TYPE,ACTOR1,ALLY_ACTOR_1,INTER1,ACTOR2,ALLY_ACTOR_2,INTER2,INTERACTION,COUNTRY,ADMIN1,ADMIN2,ADMIN3,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,NOTES,FATALITIES +615,1416RTA,,18/04/2001,,1,Violence against civilians,Police Forces of Algeria (1999-),,1,Civilians (Algeria),Berber Ethnic Group (Algeria),7,17,Algeria,Tizi Ouzou,Beni-Douala,,Beni Douala,36.61954,4.08282,1,Associated Press Online,A Berber 
student was shot while in police custody at a police station in Beni Douala. He later died on Apr.21.,1 +615,2229RTA,,19/04/2001,,1,Riots/Protests,Rioters (Algeria),Berber Ethnic Group (Algeria),5,Police Forces of Algeria (1999-),,1,15,Algeria,Tizi Ouzou,Tizi Ouzou,,Tizi Ouzou,36.71183,4.04591,3,Kabylie report,"Riots were reported in numerous villages in Kabylie, resulting in dozens wounded in clashes between protesters and police and significant material damage.",0 +615,2230RTA,,20/04/2001,,1,Riots/Protests,Protesters (Algeria),Students (Algeria),6,,,0,60,Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,1,Crisis Group,Students protested in the Amizour area. At least 3 were later arrested for allegedly insulting gendarmes.,0 +615,2231RTA,,21/04/2001,,1,Riots/Protests,Rioters (Algeria),Berber Ethnic Group (Algeria),5,Police Forces of Algeria (1999-),,1,15,Algeria,Bejaia,Amizour,,Amizour,36.64022,4.90131,1,Kabylie report,"Rioters threw molotov cocktails, rocks and burning tires at gendarmerie stations in Beni Douala, El-Kseur and Amizour.",0