diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
index 2ee634e1..60375c1a 100644
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
@@ -16,13 +16,13 @@ jobs:
       id-token: write # IMPORTANT: mandatory for trusted publishing
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: Get history and tags for versioning to work
         run: |
           git fetch --prune --unshallow
           git fetch --depth=1 origin +refs/tags/*:refs/tags/*
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.x'
       - name: Install dependencies
diff --git a/.github/workflows/run-python-tests.yaml b/.github/workflows/run-python-tests.yaml
index 4a614c09..52270393 100644
--- a/.github/workflows/run-python-tests.yaml
+++ b/.github/workflows/run-python-tests.yaml
@@ -26,9 +26,9 @@ jobs:
       pull-requests: write
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.x'
       - name: Install dependencies
diff --git a/documentation/index.md b/documentation/index.md
index 1748ec18..98695f55 100755
--- a/documentation/index.md
+++ b/documentation/index.md
@@ -899,6 +899,13 @@ The built-in configuration assumes data will be of form similar to below:
 
 ## Resource Specific Operations
 
+When creating or updating a resource that has no id, if you supply a dataset
+parameter, the resource is assigned to that dataset and compared to the dataset's
+existing resources. If a match is found, the resource is given the matching
+resource's id and that resource on HDX is overwritten.
+
+    resource.create_in_hdx(dataset=DATASET)
+
 You can download a resource using the **download** function eg.
 
     url, path = resource.download("FOLDER_TO_DOWNLOAD_TO")
diff --git a/pyproject.toml b/pyproject.toml
index 429936f7..aad7e570 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ dependencies = [
     "defopt>=7.0.0",
     "email_validator",
     "hdx-python-country>=3.9.8",
-    "hdx-python-utilities>=3.9.5",
+    "hdx-python-utilities>=3.9.6",
     "libhxl>=5.2.2",
     "makefun",
     "quantulum3",
diff --git a/requirements.txt b/requirements.txt
index 1bd94ded..c830b288 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,11 +14,11 @@ babel==2.17.0
     # via mkdocs-material
 backrefs==6.1
     # via mkdocs-material
-cachetools==6.2.2
+cachetools==6.2.4
     # via google-auth
 certifi==2025.11.12
     # via requests
-cfgv==3.4.0
+cfgv==3.5.0
     # via pre-commit
 chardet==5.2.0
     # via frictionless
@@ -32,7 +32,7 @@ click==8.3.1
     #   typer
 colorama==0.4.6
     # via mkdocs-material
-coverage==7.12.0
+coverage==7.13.0
     # via pytest-cov
 defopt==7.0.0
     # via hdx-python-api (pyproject.toml)
@@ -50,13 +50,13 @@ email-validator==2.3.0
     # via hdx-python-api (pyproject.toml)
 et-xmlfile==2.0.0
     # via openpyxl
-filelock==3.20.0
+filelock==3.20.1
     # via virtualenv
 frictionless==5.18.1
     # via hdx-python-utilities
 ghp-import==2.1.0
     # via mkdocs
-google-auth==2.43.0
+google-auth==2.45.0
     # via
     #   google-auth-oauthlib
     #   gspread
@@ -66,7 +66,7 @@ gspread==6.2.1
     # via hdx-python-api (pyproject.toml)
 hdx-python-country==3.9.8
     # via hdx-python-api (pyproject.toml)
-hdx-python-utilities==3.9.5
+hdx-python-utilities==3.9.6
     # via
     #   hdx-python-api (pyproject.toml)
     #   hdx-python-country
@@ -161,7 +161,7 @@ pathspec==0.12.1
     # via mkdocs
 petl==1.7.17
     # via frictionless
-platformdirs==4.5.0
+platformdirs==4.5.1
     # via
     #   mkdocs-get-deps
     #   virtualenv
@@ -175,7 +175,7 @@ ply==3.11
     #   libhxl
 pockets==0.9.1
     # via sphinxcontrib-napoleon
-pre-commit==4.4.0
+pre-commit==4.5.1
     # via hdx-python-api (pyproject.toml)
 pyasn1==0.6.1
     # via
@@ -183,7 +183,7 @@ pyasn1==0.6.1
     #   rsa
 pyasn1-modules==0.4.2
     # via google-auth
-pydantic==2.12.4
+pydantic==2.12.5
     # via frictionless
 pydantic-core==2.41.5
     # via pydantic
@@ -192,16 +192,16 @@ pygments==2.19.2
     #   mkdocs-material
     #   pytest
     #   rich
-pymdown-extensions==10.17.1
+pymdown-extensions==10.19.1
     # via mkdocs-material
 pyphonetics==0.5.3
     # via hdx-python-utilities
-pytest==9.0.1
+pytest==9.0.2
     # via
     #   hdx-python-api (pyproject.toml)
     #   pytest-check
     #   pytest-cov
-pytest-check==2.6.0
+pytest-check==2.6.2
     # via hdx-python-api (pyproject.toml)
 pytest-cov==7.0.0
     # via hdx-python-api (pyproject.toml)
@@ -253,7 +253,7 @@ rfc3986==2.0.0
     # via frictionless
 rich==14.2.0
     # via typer
-rpds-py==0.29.0
+rpds-py==0.30.0
     # via
     #   jsonschema
     #   referencing
@@ -307,7 +307,7 @@ unidecode==1.4.0
     # via
     #   libhxl
     #   pyphonetics
-urllib3==2.5.0
+urllib3==2.6.2
     # via
     #   libhxl
     #   requests
diff --git a/src/hdx/data/dataset.py b/src/hdx/data/dataset.py
index 068c41c4..c920d432 100755
--- a/src/hdx/data/dataset.py
+++ b/src/hdx/data/dataset.py
@@ -278,7 +278,7 @@ def add_update_resource(
         self,
         resource: Union["Resource", Dict, str],
         ignore_datasetid: bool = False,
-    ) -> None:
+    ) -> "Resource":
         """Add new or update existing resource in dataset with new metadata
 
         Args:
@@ -286,7 +286,7 @@
             ignore_datasetid (bool): Whether to ignore dataset id in the resource
 
         Returns:
-            None
+            Resource: The resource that was added after matching with any existing resource
         """
         resource = self._get_resource_from_obj(resource)
         if "package_id" in resource:
@@ -298,14 +298,15 @@
         resource_index = ResourceMatcher.match_resource_list(self._resources, resource)
         if resource_index is None:
             self._resources.append(resource)
-        else:
-            updated_resource = merge_two_dictionaries(
-                self._resources[resource_index], resource
-            )
-            if resource.get_file_to_upload():
-                updated_resource.set_file_to_upload(resource.get_file_to_upload())
-            if resource.is_marked_data_updated():
-                updated_resource.mark_data_updated()
+            return resource
+        updated_resource = merge_two_dictionaries(
+            self._resources[resource_index], resource
+        )
+        if resource.get_file_to_upload():
+            updated_resource.set_file_to_upload(resource.get_file_to_upload())
+        if resource.is_marked_data_updated():
+            updated_resource.mark_data_updated()
+        return updated_resource
 
     def add_update_resources(
         self,
diff --git a/src/hdx/data/resource.py b/src/hdx/data/resource.py
index 67b80c18..387d1827 100755
--- a/src/hdx/data/resource.py
+++ b/src/hdx/data/resource.py
@@ -8,6 +8,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import hdx.data.dataset
+import hdx.data.resource_matcher
 from hdx.api.configuration import Configuration
 from hdx.api.utilities.date_helper import DateHelper
 from hdx.api.utilities.size_hash import get_size_and_hash
@@ -460,6 +461,39 @@ def _resource_merge_hdx_update(
         self._merge_hdx_update("resource", "id", files, True, **kwargs)
         return status
 
+    def _get_resource_id(self, **kwargs: Any) -> Optional[str]:
+        """Helper function to get resource id if available from given resource or by
+        comparing to a given dataset's resources.
+
+        Args:
+            **kwargs: See below
+            dataset (Dataset): Existing dataset if available to obtain resource id
+
+        Returns:
+            Optional[str]: Resource id or None
+        """
+        loadedid = self.data.get("id")
+        if loadedid is None:
+            dataset = kwargs.get("dataset")
+            if dataset:
+                dataset_id = dataset.get("id")
+                if dataset_id:
+                    existing_dataset_id = self.data.get("package_id")
+                    if not existing_dataset_id or existing_dataset_id == dataset_id:
+                        self.data["package_id"] = dataset["id"]
+                        dataset_resources = dataset.get_resources()
+                        matching_index = hdx.data.resource_matcher.ResourceMatcher.match_resource_list(
+                            dataset_resources, self
+                        )
+                        if matching_index is not None:
+                            matching_resource = dataset_resources[matching_index]
+                            loadedid = matching_resource.get("id")
+        if loadedid:
+            self.data["id"] = loadedid
+        else:
+            loadedid = None
+        return loadedid
+
     def update_in_hdx(self, **kwargs: Any) -> int:
         """Check if resource exists in HDX and if so, update it.
         To indicate that the data in an external resource (given by a URL) has been
@@ -482,11 +516,13 @@
             data_updated (bool): If True, set last_modified to now. Defaults to False.
             date_data_updated (datetime): Date to use for last_modified. Default to None.
             force_update (bool): Force file to be updated even if it hasn't changed. Defaults to False.
+            dataset (Dataset): Existing dataset if available to obtain resource id
 
         Returns:
             int: Status code
         """
         self.check_both_url_filetoupload()
+        _ = self._get_resource_id(**kwargs)
         self._check_load_existing_object("resource", "id")
         return self._resource_merge_hdx_update(**kwargs)
 
@@ -513,15 +549,18 @@
             data_updated (bool): If True, set last_modified to now. Defaults to False.
             date_data_updated (datetime): Date to use for last_modified. Default to None.
             force_update (bool): Force file to be updated even if it hasn't changed. Defaults to False.
+            dataset (Dataset): Existing dataset if available to obtain resource id
 
         Returns:
             int: Status code
         """
         self.check_both_url_filetoupload()
-        id = self.data.get("id")
-        if id and self._load_from_hdx("resource", id):
-            logger.warning(f"{'resource'} exists. Updating {id}")
-            return self._resource_merge_hdx_update(**kwargs)
+        loadedid = self._get_resource_id(**kwargs)
+        if loadedid:
+            if self._load_from_hdx("resource", loadedid):
+                logger.warning(f"{'resource'} exists. Updating {loadedid}")
+                return self._resource_merge_hdx_update(**kwargs)
+            logger.warning(f"Failed to load resource with id {loadedid}")
         self.set_types()
         self.correct_format(self.data)
 
diff --git a/tests/hdx/data/test_resource.py b/tests/hdx/data/test_resource.py
index b07b926b..4020c925 100755
--- a/tests/hdx/data/test_resource.py
+++ b/tests/hdx/data/test_resource.py
@@ -12,6 +12,7 @@
 from .. import MockResponse, dataset_resultdict, resource_data
 from .test_resource_view import resource_view_list, resource_view_mocklist
 from hdx.api.configuration import Configuration
+from hdx.data.dataset import Dataset
 from hdx.data.hdxobject import HDXError
 from hdx.data.resource import Resource
 from hdx.utilities.dateparse import parse_date
@@ -1068,3 +1069,16 @@ def test_get_api_url(self, configuration, read):
         )
         del resource["id"]
         assert resource.get_api_url() is None
+
+    def test_get_resource_id(self, configuration, read):
+        resources = [
+            {"id": "abcd", "name": "test_resource", "format": "CSV"},
+            {"id": "efgh", "name": "test_resource2", "format": "CSV"},
+            {"id": "ijkl", "name": "test_resource2", "format": "XLSX"},
+        ]
+        dataset = Dataset({"id": "1234", "name": "test_dataset", "format": "CSV"})
+        dataset.add_update_resources(resources)
+
+        resource = Resource({"name": "test_resource2", "format": "CSV"})
+        result = resource._get_resource_id(dataset=dataset)
+        assert result == "efgh"
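
The documentation and resource.py changes above can be exercised roughly as sketched below, assuming an HDX Configuration has already been created; the dataset name, resource name and URL are placeholders rather than values from this changeset:

    from hdx.data.dataset import Dataset
    from hdx.data.resource import Resource

    # Read the dataset whose resources should be matched against (placeholder name)
    dataset = Dataset.read_from_hdx("my-dataset-name")
    # Build a resource without an id so it can be matched against the dataset's resources
    resource = Resource(
        {"name": "my_data.csv", "format": "CSV", "url": "https://example.com/my_data.csv"}
    )
    # Passing dataset= lets create_in_hdx look up a matching resource in that dataset;
    # if one is found, its id is reused and that resource on HDX is overwritten
    resource.create_in_hdx(dataset=dataset)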
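
Similarly, a brief sketch of how the changed add_update_resource return value in dataset.py might be used, continuing from the objects above; the resource metadata is illustrative only:

    # add_update_resource now returns the appended resource or, when a match is found,
    # the existing resource merged with the new metadata, so callers can keep using it
    updated = dataset.add_update_resource(
        {"name": "my_data.csv", "format": "CSV", "description": "Example data"}
    )
    updated.mark_data_updated()  # e.g. flag that the data behind the resource changed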