From 3677dec20da040a14eab8af285e7ebd1c9a9971d Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:00:37 -0700 Subject: [PATCH 01/31] update PydapArrayWrapper to support backend batching --- xarray/backends/pydap_.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index b6114e3f7af..6690ac6f1f7 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -36,8 +36,10 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array): + def __init__(self, array, batch=False, cache=None): self.array = array + self._batch = batch + self._cache = cache @property def shape(self) -> tuple[int, ...]: @@ -53,13 +55,29 @@ def __getitem__(self, key): ) def _getitem(self, key): - result = robust_getitem(self.array, key, catch=ValueError) - # in some cases, pydap doesn't squeeze axes automatically like numpy - result = np.asarray(result) + if self.array.id in self._cache.keys(): + # safely avoid re-downloading some coordinates + result = self._cache[self.array.id] + elif self._batch and hasattr(self.array, "dataset"): + # this are both True only for pydap>3.5.5 + from pydap.lib import resolve_batch_for_all_variables + + parent = self.array.parent # could be root ds | group + variables = list(parent.variables()) + resolve_batch_for_all_variables(parent, variables, key) + + result = np.asarray( + parent.dataset._current_batch_promise.wait_for_result(self.array.id) + ) + else: + result = robust_getitem(self.array, key, catch=ValueError) + try: + result = np.asarray(result.data) + except AttributeError: + result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != self.array.ndim and axis: result = np.squeeze(result, axis) - return result From 41876444d6893dd7dce15db08466fe7d9ce169c7 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Mon, 10 Nov 2025 10:29:10 -0800 Subject: [PATCH 
02/31] rebase --- xarray/backends/pydap_.py | 67 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 6690ac6f1f7..96a51dc8b39 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -100,7 +101,7 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. """ - def __init__(self, dataset, group=None): + def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): """ Parameters ---------- @@ -110,6 +111,11 @@ def __init__(self, dataset, group=None): """ self.dataset = dataset self.group = group + self.session = session + self._batch = batch + self._batch_done = False + self._array_cache = {} # holds 1D dimension data + self._protocol = protocol @classmethod def open( @@ -122,6 +128,7 @@ def open( timeout=None, verify=None, user_charset=None, + batch=False, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -136,6 +143,7 @@ def open( DeprecationWarning, ) output_grid = False # new default behavior + kwargs = { "url": url, "application": application, @@ -153,12 +161,26 @@ def open( dataset = url.ds args = {"dataset": dataset} if group: - # only then, change the default args["group"] = group + if url.startswith(("https", "dap2")): + args["protocol"] = "dap2" + else: + args["protocol"] = "dap4" + if batch: + if args["protocol"] == "dap2": + warnings.warn( + f"`batch={batch}` is currently only compatible with the `DAP4` " + "protocol. Make sue the OPeNDAP server implements the `DAP4` " + "protocol and then replace the scheme of the url with `dap4` " + "to make use of it. 
Setting `batch=False`.", + stacklevel=2, + ) + else: + # only update if dap4 + args["batch"] = batch return cls(**args) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) try: dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims @@ -167,6 +189,25 @@ def open_store_variable(self, var): # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions + if ( + self._protocol == "dap4" + and var.name in dimensions + and hasattr(var, "dataset") # only True for pydap>3.5.5 + ): + if not var.dataset._batch_mode: + # for dap4, always batch all dimensions at once + var.dataset.enable_batch_mode() + data_array = self._get_data_array(var) + data = indexing.LazilyIndexedArray(data_array) + if not self._batch and var.dataset._batch_mode: + # if `batch=False``, restore it for all other variables + var.dataset.disable_batch_mode() + else: + # all non-dimension variables + data = indexing.LazilyIndexedArray( + PydapArrayWrapper(var, self._batch, self._array_cache) + ) + return Variable(dimensions, data, var.attributes) def get_variables(self): @@ -184,6 +225,7 @@ def get_variables(self): # check the key is not a BaseType or GridType if not isinstance(self.ds[var], GroupType) ] + return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars) def get_attrs(self): @@ -195,9 +237,11 @@ def get_attrs(self): "libdap", "invocation", "dimensions", + "path", + "Maps", ) - attrs = self.ds.attributes - list(map(attrs.pop, opendap_attrs, [None] * 6)) + attrs = dict(self.ds.attributes) + list(map(attrs.pop, opendap_attrs, [None] * 8)) return Frozen(attrs) def get_dimensions(self): @@ -207,6 +251,19 @@ def get_dimensions(self): def ds(self): return get_group(self.dataset, self.group) + def _get_data_array(self, var): + """gets dimension data all at once, storing the numpy + arrays within a cached dictionary + """ + from 
pydap.lib import get_batch_data + + if not self._batch_done or var.id not in self._array_cache: + # store all dim data into a dict for reuse + self._array_cache = get_batch_data(var.parent, self._array_cache) + self._batch_done = True + + return self._array_cache[var.id] + class PydapBackendEntrypoint(BackendEntrypoint): """ From 729dc49965b06b1c795f024eb8d93d3f504e0c70 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:17:51 -0700 Subject: [PATCH 03/31] pydap-server it not necessary --- ci/requirements/environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index f56b2bc1d1c..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -37,7 +37,6 @@ dependencies: - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - - pydap-server - pytest - pytest-asyncio - pytest-cov From 1fd9e18964fc291660e4a294745af6fc75411e0f Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:35:52 -0700 Subject: [PATCH 04/31] set `batch=False` as default --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 96a51dc8b39..d670aab4927 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -325,6 +325,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, + batch=False, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -335,6 +336,7 @@ def open_dataset( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): From f6a78b0c93ac4a780c286ff48baab603f4768f75 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:36:40 -0700 Subject: [PATCH 05/31] set `batch=False` as default in datatree --- xarray/backends/pydap_.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index d670aab4927..68d440a9046 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -369,6 +369,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, + batch=False, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -381,10 +382,11 @@ def open_datatree( decode_timedelta=decode_timedelta, group=group, application=None, - session=None, - timeout=None, - verify=None, - user_charset=None, + session=session, + timeout=timeout, + verify=application, + user_charset=user_charset, + batch=batch, ) return datatree_from_dict_with_io_cleanup(groups_dict) From 326d925b6fab13af61736e3090ee4f9342dd54c5 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:37:42 -0700 Subject: [PATCH 06/31] set `batch=False` as default in open groups as dict --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 68d440a9046..c601985d45c 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -408,6 +408,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, + batch=False, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -419,6 +420,7 @@ def open_groups_as_dict( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) # Check for a group and make it a parent if it exists From 0f0dede403a01d219e63ca57b3a827f50c980fca Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 12:16:42 -0700 Subject: [PATCH 07/31] for flaky, install pydap from repo for now --- ci/requirements/environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - 
pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From a35efa51bd13aa3f13de55b36e1d0fde50a017fc Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 17:38:11 -0700 Subject: [PATCH 08/31] initial tests - quantify cached url --- xarray/backends/pydap_.py | 4 +-- xarray/tests/test_backends.py | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index c601985d45c..08d4fed9c8e 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -162,9 +162,9 @@ def open( args = {"dataset": dataset} if group: args["group"] = group - if url.startswith(("https", "dap2")): + if url.startswith(("http", "dap2")): args["protocol"] = "dap2" - else: + elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: if args["protocol"] == "dap2": diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 36a1e354d9c..1c2ccc469d1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6560,6 +6560,53 @@ def test_session(self) -> None: ) +@requires_pydap +@network +@pytest.mark.parametrize("protocol", ["dap2", "dap4"]) +def test_batchdap4_downloads(protocol) -> None: + """Test that in dap4, all dimensions are downloaded at once""" + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + session = CachedSession() + session.cache.clear() + url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + + open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + ) + if protocol 
== "dap4": + if _version_ > Version("3.5.5"): + # should download 2 urls only (1 dmr and 1 dap) + assert len(session.cache.urls()) == 2 + else: + assert len(session.cache.urls()) == 4 + # das + dds + 3 dods urls + elif protocol == "dap2": + assert len(session.cache.urls()) == 5 + + +@requires_pydap +@network +def test_batch_warnswithdap2() -> None: + from requests_cache import CachedSession + + session = CachedSession() + session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + with pytest.warns(UserWarning): + open_dataset( + url, engine="pydap", session=session, batch=True, decode_times=False + ) + + # no batching is supported here + assert len(session.cache.urls()) == 5 + + class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) From fcb2eaea8803c9801ef77c24c0efb2c6c1db43d3 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 21:30:40 -0700 Subject: [PATCH 09/31] adds tests to datatree backend to assert multiple dimensions downloaded at once (per group) --- xarray/tests/test_backends_datatree.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index f534752c52d..58ad8029bfb 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from packaging.version import Version import xarray as xr from xarray import DataTree, load_datatree, open_datatree, open_groups @@ -641,7 +642,15 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: │ Temperature (time, Z, Y, X) float32 ... | Salinity (time, Z, Y, X) float32 ... 
""" - tree = open_datatree(url, engine=self.engine) + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + + session = CachedSession() + session.cache.clear() + + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",) @@ -652,6 +661,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) + # group (including root). So in this case 3. In the future there + # should a only be 2 downloads (all dimensions should be downloaded) + # within single + + if _version_ > Version("3.5.5"): + # Total downloads are: 1 dmr, + 1 dap url per Group | root. + # since there is a group then 2 dap url. In the future there + # should only be 1 dap url downloaded. + assert len(session.cache.urls()) == 3 + else: + # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) + assert len(session.cache.urls()) == 5 + def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None: aligned_dict_of_datasets = open_groups(url, engine=self.engine) aligned_dt = DataTree.from_dict(aligned_dict_of_datasets) From 677e3dee5d8431dd1299868139318cbd60c820f1 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 22:47:22 -0700 Subject: [PATCH 10/31] update testing to show number of download urls --- xarray/tests/test_backends.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1c2ccc469d1..eefbbc36543 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6563,7 +6563,8 @@ def test_session(self) -> None: @requires_pydap @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) -def test_batchdap4_downloads(protocol) -> None: 
+@pytest.mark.parametrize("batch", [False, True]) +def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from requests_cache import CachedSession @@ -6573,20 +6574,36 @@ def test_batchdap4_downloads(protocol) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - open_dataset( - url.replace("https", protocol), - engine="pydap", - session=session, - decode_times=False, - ) + args = { + "filename_or_obj": url.replace("https", protocol), + "engine": "pydap", + "session": session, + "decode_times": False, + } + if protocol == "dap4": + ds = open_dataset(**args, batch=batch) if _version_ > Version("3.5.5"): - # should download 2 urls only (1 dmr and 1 dap) + # total downloads are: + # 1 dmr + 1 dap (dimensions) assert len(session.cache.urls()) == 2 + # now load the rest of the variables + ds.load() + if batch: + # all non-dimensions are downloaded in a single https requests + assert len(session.cache.urls()) == 2 + 1 + if not batch: + # each non-dimension array is downloaded with an individual + # https requests + assert len(session.cache.urls()) == 2 + 4 else: assert len(session.cache.urls()) == 4 - # das + dds + 3 dods urls + ds.load() + assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": + ds = open_dataset(**args) + # das + dds + 3 dods urls + assert len(session.cache.urls()) == 5 From 7f05a6a8efec67064bc0edcae8fb181ea8362a9f Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:01 -0700 Subject: [PATCH 11/31] simplified logic --- xarray/backends/pydap_.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 08d4fed9c8e..215c38ad6e5 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -63,12 +63,10 @@ def _getitem(self, key): # this are both True only for pydap>3.5.5 from pydap.lib import 
resolve_batch_for_all_variables - parent = self.array.parent # could be root ds | group - variables = list(parent.variables()) - resolve_batch_for_all_variables(parent, variables, key) - + dataset = self.array.dataset + resolve_batch_for_all_variables(self.array, key) result = np.asarray( - parent.dataset._current_batch_promise.wait_for_result(self.array.id) + dataset._current_batch_promise.wait_for_result(self.array.id) ) else: result = robust_getitem(self.array, key, catch=ValueError) From e360560d0ec0c01d576d2d7a101b0bc495526c14 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:24 -0700 Subject: [PATCH 12/31] specify cached session debug name to actually cache urls --- xarray/tests/test_backends.py | 2 +- xarray/tests/test_backends_datatree.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index eefbbc36543..b82a81fe1fb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6570,7 +6570,7 @@ def test_batchdap4_downloads(protocol, batch) -> None: from requests_cache import CachedSession _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 58ad8029bfb..e62088cc619 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -647,7 +647,7 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() tree = open_datatree(url, engine=self.engine, session=session) @@ -661,15 +661,9 @@ def test_inherited_coords(self, 
url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) - # group (including root). So in this case 3. In the future there - # should a only be 2 downloads (all dimensions should be downloaded) - # within single - if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url per Group | root. - # since there is a group then 2 dap url. In the future there - # should only be 1 dap url downloaded. - assert len(session.cache.urls()) == 3 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups + assert len(session.cache.urls()) == 2 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From c6ed8bf62c7ab556276525ac59b0b97033d0d232 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:24:30 -0700 Subject: [PATCH 13/31] fix for mypy --- xarray/tests/test_backends.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b82a81fe1fb..ce3825a882b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6574,15 +6574,14 @@ def test_batchdap4_downloads(protocol, batch) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - args = { - "filename_or_obj": url.replace("https", protocol), - "engine": "pydap", - "session": session, - "decode_times": False, - } - if protocol == "dap4": - ds = open_dataset(**args, batch=batch) + ds = open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + batch=batch, + ) if _version_ > Version("3.5.5"): # total downloads are: # 1 dmr + 1 dap (dimensions) @@ -6601,9 +6600,13 @@ def test_batchdap4_downloads(protocol, batch) -> None: ds.load() assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": - ds = open_dataset(**args) + ds = open_dataset( + url.replace("https", protocol), + 
engine="pydap", + session=session, + decode_times=False, + ) # das + dds + 3 dods urls - assert len(session.cache.urls()) == 5 From 54f6f8dd7838f705eb41b5a44b34cb5f499ac990 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:21:26 -0700 Subject: [PATCH 14/31] user visible changes on `whats-new.rst` --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 63a002b6f31..a36681a905a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -414,6 +414,9 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading + dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables + downloading multiple arrays in single response. Breaking changes ~~~~~~~~~~~~~~~~ From 419b25eb060e9cdd722e4e05dad16c306595a2bb Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:50:21 -0700 Subject: [PATCH 15/31] impose sorted to `get_dimensions` method --- xarray/backends/pydap_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 215c38ad6e5..1d9d498678b 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -243,7 +243,7 @@ def get_attrs(self): return Frozen(attrs) def get_dimensions(self): - return Frozen(self.ds.dimensions) + return Frozen(sorted(self.ds.dimensions)) @property def ds(self): From 747fcc7f211c231abedf7e9d452f23baad6967fc Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:52:45 -0700 Subject: [PATCH 16/31] reformat `whats-new.rst` --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a36681a905a..18ff81a2bee 
100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -414,9 +414,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading - dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables - downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. + By `Miguel Jimenez-Urias `_. Breaking changes ~~~~~~~~~~~~~~~~ From 381c499944c2f9ffecf79a82217a9f24193bf722 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 10:12:24 -0700 Subject: [PATCH 17/31] revert to install pydap from conda and not from repo --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From 5f5c4e1c0027179299b487397b4216d7f1673820 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:34:26 -0700 Subject: [PATCH 18/31] expose checksum as user kwarg --- 
ci/requirements/environment.yml | 3 ++- xarray/backends/pydap_.py | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 1d9d498678b..a681a0cbcd5 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -37,10 +37,11 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None): + def __init__(self, array, batch=False, cache=None, checksums=True): self.array = array self._batch = batch self._cache = cache + self._checksums = checksums @property def shape(self) -> tuple[int, ...]: @@ -64,7 +65,7 @@ def _getitem(self, key): from pydap.lib import resolve_batch_for_all_variables dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key) + resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) result = np.asarray( dataset._current_batch_promise.wait_for_result(self.array.id) ) @@ -99,7 +100,15 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. 
""" - def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): + def __init__( + self, + dataset, + group=None, + session=None, + batch=False, + protocol=None, + checksums=True, + ): """ Parameters ---------- @@ -114,6 +123,7 @@ def __init__(self, dataset, group=None, session=None, batch=False, protocol=None self._batch_done = False self._array_cache = {} # holds 1D dimension data self._protocol = protocol + self._checksums = checksums # true by default @classmethod def open( @@ -127,6 +137,7 @@ def open( verify=None, user_charset=None, batch=False, + checksums=True, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -158,6 +169,7 @@ def open( # pydap dataset dataset = url.ds args = {"dataset": dataset} + args["checksums"] = checksums if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -203,7 +215,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache) + PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -257,7 +269,9 @@ def _get_data_array(self, var): if not self._batch_done or var.id not in self._array_cache: # store all dim data into a dict for reuse - self._array_cache = get_batch_data(var.parent, self._array_cache) + self._array_cache = get_batch_data( + var.parent, self._array_cache, self._checksums + ) self._batch_done = True return self._array_cache[var.id] @@ -324,6 +338,7 @@ def open_dataset( verify=None, user_charset=None, batch=False, + checksums=True, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -335,6 +350,7 @@ def open_dataset( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -368,6 +384,7 @@ def open_datatree( verify=None, user_charset=None, batch=False, + 
checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -385,6 +402,7 @@ def open_datatree( verify=application, user_charset=user_charset, batch=batch, + checksums=checksums, ) return datatree_from_dict_with_io_cleanup(groups_dict) @@ -407,6 +425,7 @@ def open_groups_as_dict( verify=None, user_charset=None, batch=False, + checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -419,6 +438,7 @@ def open_groups_as_dict( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) # Check for a group and make it a parent if it exists From e15f8cb88cbf47ee6e47d2c6bb4c96b2984f0251 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:47:17 -0700 Subject: [PATCH 19/31] include `checksums` optional argument in `whats-new` --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 18ff81a2bee..b3be1624b81 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -414,7 +414,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_. 
Breaking changes From 0a2730c9d011ac3d72bdf11815aa7d44f09326d6 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 15:06:08 -0700 Subject: [PATCH 20/31] update to newest release of pydap via pip until conda install is available --- ci/requirements/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..b012fe82e56 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -65,4 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release + - pydap==3.5.6 # just for now - will restore to conda after new release From a5d2b0f4891ef1ea816c234f1196bf476466ba61 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 16:12:24 -0700 Subject: [PATCH 21/31] use requests_cache session with retry-params when 500 errors occur --- xarray/backends/pydap_.py | 1 - xarray/tests/test_backends.py | 11 +++++++---- xarray/tests/test_backends_datatree.py | 9 +++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index a681a0cbcd5..c2ae13e777c 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -118,7 +118,6 @@ def __init__( """ self.dataset = dataset self.group = group - self.session = session self._batch = batch self._batch_done = False self._array_cache = {} # holds 1D dimension data diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ce3825a882b..18c68b337c0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6567,10 +6567,11 @@ def test_session(self) -> None: def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap - from requests_cache import 
CachedSession + from pydap.net import create_session _version_ = Version(pydap.__version__) - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" @@ -6613,10 +6614,12 @@ def test_batchdap4_downloads(protocol, batch) -> None: @requires_pydap @network def test_batch_warnswithdap2() -> None: - from requests_cache import CachedSession + from pydap.net import create_session - session = CachedSession() + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" with pytest.warns(UserWarning): open_dataset( diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index e62088cc619..2d2676c92b7 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -643,13 +643,14 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: | Salinity (time, Z, Y, X) float32 ... 
""" import pydap - from requests_cache import CachedSession + from pydap.net import create_session - _version_ = Version(pydap.__version__) - - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + _version_ = Version(pydap.__version__) + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) From 9a88316a114c622280f4d5a3802097a63e351dcd Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Thu, 14 Aug 2025 09:45:07 -0700 Subject: [PATCH 22/31] update env yml file to use new pydap release via conda --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index b012fe82e56..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - pydap==3.5.6 # just for now - will restore to conda after new release From d2835ab745db05c46eafee7628c4d32d3804f097 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Mon, 10 Nov 2025 10:31:56 -0800 Subject: [PATCH 23/31] turn on testing on datatree from test.opendap.org --- xarray/tests/test_backends_datatree.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 2d2676c92b7..e2d4913ae55 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ 
-570,8 +570,6 @@ def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree) -> None: class TestPyDAPDatatreeIO: """Test PyDAP backend for DataTree.""" - pytestmark = pytest.mark.xfail(reason="test.opendap.org reports a 404 error") - engine: T_DataTreeNetcdfEngine | None = "pydap" # you can check these by adding a .dmr to urls, and replacing dap4 with http unaligned_datatree_url = ( From b60adb556e67e5b59fe8c8d90acc9d6a4d74ef93 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Mon, 10 Nov 2025 11:46:56 -0800 Subject: [PATCH 24/31] rebase with main --- xarray/backends/pydap_.py | 75 +++++++++----------------- xarray/tests/test_backends.py | 24 ++------- xarray/tests/test_backends_datatree.py | 14 +++-- 3 files changed, 37 insertions(+), 76 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index c2ae13e777c..5e27eb53545 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,7 +1,6 @@ from __future__ import annotations import os -import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -37,10 +36,9 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None, checksums=True): + def __init__(self, array, batch=None, checksums=True): self.array = array self._batch = batch - self._cache = cache self._checksums = checksums @property @@ -57,27 +55,19 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self.array.id in self._cache.keys(): - # safely avoid re-downloading some coordinates - result = self._cache[self.array.id] - elif self._batch and hasattr(self.array, "dataset"): + if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 - from pydap.lib import resolve_batch_for_all_variables + from pydap.client import data_check, get_batch_data dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) - result = np.asarray( - 
dataset._current_batch_promise.wait_for_result(self.array.id) - ) + get_batch_data(self.array, checksums=self._checksums, key=key) + result = data_check(np.asarray(dataset[self.array.id].data), key) else: result = robust_getitem(self.array, key, catch=ValueError) - try: - result = np.asarray(result.data) - except AttributeError: - result = np.asarray(result) - axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) - if result.ndim + len(axis) != self.array.ndim and axis: - result = np.squeeze(result, axis) + result = np.asarray(result.data) + axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + if result.ndim + len(axis) != self.array.ndim and axis: + result = np.squeeze(result, axis) return result @@ -105,7 +95,7 @@ def __init__( dataset, group=None, session=None, - batch=False, + batch=None, protocol=None, checksums=True, ): @@ -119,8 +109,6 @@ def __init__( self.dataset = dataset self.group = group self._batch = batch - self._batch_done = False - self._array_cache = {} # holds 1D dimension data self._protocol = protocol self._checksums = checksums # true by default @@ -135,7 +123,7 @@ def open( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ): from pydap.client import open_url @@ -167,8 +155,7 @@ def open( elif hasattr(url, "ds"): # pydap dataset dataset = url.ds - args = {"dataset": dataset} - args["checksums"] = checksums + args = {"dataset": dataset, "checksums": checksums} if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -176,25 +163,15 @@ def open( elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: - if args["protocol"] == "dap2": - warnings.warn( - f"`batch={batch}` is currently only compatible with the `DAP4` " - "protocol. Make sue the OPeNDAP server implements the `DAP4` " - "protocol and then replace the scheme of the url with `dap4` " - "to make use of it. 
Setting `batch=False`.", - stacklevel=2, - ) - else: - # only update if dap4 - args["batch"] = batch + args["batch"] = batch return cls(**args) def open_store_variable(self, var): - try: + if hasattr(var, "dims"): dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims ] - except AttributeError: + else: # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions @@ -214,7 +191,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) + PydapArrayWrapper(var, self._batch, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -264,16 +241,14 @@ def _get_data_array(self, var): """gets dimension data all at once, storing the numpy arrays within a cached dictionary """ - from pydap.lib import get_batch_data + from pydap.client import get_batch_data - if not self._batch_done or var.id not in self._array_cache: - # store all dim data into a dict for reuse - self._array_cache = get_batch_data( - var.parent, self._array_cache, self._checksums - ) - self._batch_done = True + if not var._is_data_loaded(): + # data has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(var, checksums=self._checksums) - return self._array_cache[var.id] + return self.dataset[var.id].data class PydapBackendEntrypoint(BackendEntrypoint): @@ -336,7 +311,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> Dataset: store = PydapDataStore.open( @@ -382,7 +357,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( @@ -423,7 +398,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, 
checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 18c68b337c0..ddd0be4f92c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6564,14 +6564,15 @@ def test_session(self) -> None: @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) @pytest.mark.parametrize("batch", [False, True]) -def test_batchdap4_downloads(protocol, batch) -> None: +def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from pydap.net import create_session _version_ = Version(pydap.__version__) # Create a session with pre-set params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" @@ -6611,25 +6612,6 @@ def test_batchdap4_downloads(protocol, batch) -> None: assert len(session.cache.urls()) == 5 -@requires_pydap -@network -def test_batch_warnswithdap2() -> None: - from pydap.net import create_session - - # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) - session.cache.clear() - - url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - with pytest.warns(UserWarning): - open_dataset( - url, engine="pydap", session=session, batch=True, decode_times=False - ) - - # no batching is supported here - assert len(session.cache.urls()) == 5 - - class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py 
index e2d4913ae55..38ec3ef83fb 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -581,7 +581,8 @@ class TestPyDAPDatatreeIO: simplegroup_datatree_url = "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5" def test_open_datatree_unaligned_hierarchy( - self, url=unaligned_datatree_url + self, + url=unaligned_datatree_url, ) -> None: with pytest.raises( ValueError, @@ -614,7 +615,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None: ) as expected: assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected) - def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: + def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None: """Test that `open_datatree` inherits coordinates from root tree. This particular h5 file is a test file that inherits the time coordinate from the root @@ -644,7 +645,10 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: from pydap.net import create_session # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session( + use_cache=True, cache_kwargs={"cache_name": cache_name} + ) session.cache.clear() _version_ = Version(pydap.__version__) @@ -661,8 +665,8 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: ) if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups - assert len(session.cache.urls()) == 2 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions for each group + assert len(session.cache.urls()) == 3 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From 578b31a2b976d075d8383fd0265871d6f7c7ad0a Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Mon, 10 Nov 2025 11:50:36 -0800 Subject: [PATCH 25/31] 
update what's new --- doc/whats-new.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b3be1624b81..273523074b5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,16 @@ v2025.11.1 (unreleased) New Features ~~~~~~~~~~~~ +- :py:func:`merge` and :py:func:`concat` now support :py:class:`DataTree` + objects (:issue:`9790`, :issue:`9778`). + By `Stephan Hoyer `_. +- The ``h5netcdf`` engine has support for pseudo ``NETCDF4_CLASSIC`` files, meaning variables and attributes are cast to supported types. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support with version 1.7.0. (:issue:`10676`, :pull:`10686`). + By `David Huard `_. +- Support comparing :py:class:`DataTree` objects with :py:func:`testing.assert_allclose` (:pull:`10887`). + By `Justus Magin `_. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. Breaking Changes ~~~~~~~~~~~~~~~~ @@ -414,9 +424,6 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. 
Breaking changes ~~~~~~~~~~~~~~~~ From 25b08cd5b8d30f02760e161fa307f6eb0f377987 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 12 Nov 2025 08:33:49 -0800 Subject: [PATCH 26/31] removes batch as arg - acts always but only on dimension data arrays --- xarray/backends/pydap_.py | 45 ++++++++------------------------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 5e27eb53545..7ea140fdd99 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -36,10 +36,8 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=None, checksums=True): + def __init__(self, array, checksums=True): self.array = array - self._batch = batch - self._checksums = checksums @property def shape(self) -> tuple[int, ...]: @@ -55,19 +53,11 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self._batch and hasattr(self.array, "dataset"): - # this are both True only for pydap>3.5.5 - from pydap.client import data_check, get_batch_data - - dataset = self.array.dataset - get_batch_data(self.array, checksums=self._checksums, key=key) - result = data_check(np.asarray(dataset[self.array.id].data), key) - else: - result = robust_getitem(self.array, key, catch=ValueError) - result = np.asarray(result.data) - axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) - if result.ndim + len(axis) != self.array.ndim and axis: - result = np.squeeze(result, axis) + result = robust_getitem(self.array, key, catch=ValueError) + result = np.asarray(result.data) + axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + if result.ndim + len(axis) != self.array.ndim and axis: + result = np.squeeze(result, axis) return result @@ -95,7 +85,6 @@ def __init__( dataset, group=None, session=None, - batch=None, protocol=None, checksums=True, ): @@ -108,7 +97,6 @@ def __init__( """ self.dataset = dataset self.group = group - self._batch = batch self._protocol 
= protocol self._checksums = checksums # true by default @@ -123,7 +111,6 @@ def open( timeout=None, verify=None, user_charset=None, - batch=None, checksums=True, ): from pydap.client import open_url @@ -162,8 +149,6 @@ def open( args["protocol"] = "dap2" elif url.startswith("dap4"): args["protocol"] = "dap4" - if batch: - args["batch"] = batch return cls(**args) def open_store_variable(self, var): @@ -180,19 +165,13 @@ def open_store_variable(self, var): and var.name in dimensions and hasattr(var, "dataset") # only True for pydap>3.5.5 ): - if not var.dataset._batch_mode: - # for dap4, always batch all dimensions at once - var.dataset.enable_batch_mode() + var.dataset.enable_batch_mode() data_array = self._get_data_array(var) data = indexing.LazilyIndexedArray(data_array) - if not self._batch and var.dataset._batch_mode: - # if `batch=False``, restore it for all other variables - var.dataset.disable_batch_mode() + var.dataset.disable_batch_mode() else: # all non-dimension variables - data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._checksums) - ) + data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) return Variable(dimensions, data, var.attributes) @@ -311,7 +290,6 @@ def open_dataset( timeout=None, verify=None, user_charset=None, - batch=None, checksums=True, ) -> Dataset: store = PydapDataStore.open( @@ -323,7 +301,6 @@ def open_dataset( timeout=timeout, verify=verify, user_charset=user_charset, - batch=batch, checksums=checksums, ) store_entrypoint = StoreBackendEntrypoint() @@ -357,7 +334,6 @@ def open_datatree( timeout=None, verify=None, user_charset=None, - batch=None, checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( @@ -375,7 +351,6 @@ def open_datatree( timeout=timeout, verify=application, user_charset=user_charset, - batch=batch, checksums=checksums, ) @@ -398,7 +373,6 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, - batch=None, checksums=True, ) -> dict[str, Dataset]: 
from xarray.core.treenode import NodePath @@ -411,7 +385,6 @@ def open_groups_as_dict( timeout=timeout, verify=verify, user_charset=user_charset, - batch=batch, checksums=checksums, ) From 0e1ff6c9e9b885a0be7ffc8f4f45021c2538c709 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 12 Nov 2025 08:54:17 -0800 Subject: [PATCH 27/31] updates tests --- xarray/tests/test_backends.py | 36 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ddd0be4f92c..296e9541b33 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6563,8 +6563,7 @@ def test_session(self) -> None: @requires_pydap @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) -@pytest.mark.parametrize("batch", [False, True]) -def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: +def test_batchdap4_downloads(tmpdir, protocol) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from pydap.net import create_session @@ -6576,39 +6575,28 @@ def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + ds = open_dataset( + url.replace("https", protocol), + session=session, + engine="pydap", + decode_times=False, + ) + if protocol == "dap4": - ds = open_dataset( - url.replace("https", protocol), - engine="pydap", - session=session, - decode_times=False, - batch=batch, - ) if _version_ > Version("3.5.5"): # total downloads are: - # 1 dmr + 1 dap (dimensions) + # 1 dmr + 1 dap (all dimensions at once) assert len(session.cache.urls()) == 2 # now load the rest of the variables ds.load() - if batch: - # all non-dimensions are downloaded in a single https requests - assert len(session.cache.urls()) == 2 + 1 - if not batch: - # each non-dimension array is downloaded with an individual - # https requests - assert 
len(session.cache.urls()) == 2 + 4 + # each non-dimension array is downloaded with an individual https requests + assert len(session.cache.urls()) == 2 + 4 else: assert len(session.cache.urls()) == 4 ds.load() assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": - ds = open_dataset( - url.replace("https", protocol), - engine="pydap", - session=session, - decode_times=False, - ) - # das + dds + 3 dods urls + # das + dds + 3 dods urls for dimensions alone assert len(session.cache.urls()) == 5 From f4f253ab40abcf795d8ccde236359becaaab2d95 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 12 Nov 2025 09:03:30 -0800 Subject: [PATCH 28/31] update `whats new` --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 273523074b5..1b8c2995c71 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,8 +21,8 @@ New Features By `David Huard `_. - Support comparing :py:class:`DataTree` objects with :py:func:`testing.assert_allclose` (:pull:`10887`). By `Justus Magin `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) dimensions data (:issue:`10628`, :pull:`10629`). + In addition ``checksums=True|False`` is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_. 
Breaking Changes From b4c7ddabb747a9ace27e43dbee43a1a1c2b0c521 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 18 Nov 2025 20:48:12 -0800 Subject: [PATCH 29/31] minor code changes --- xarray/backends/pydap_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 7ea140fdd99..9976e180e1b 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -206,7 +206,7 @@ def get_attrs(self): "Maps", ) attrs = dict(self.ds.attributes) - list(map(attrs.pop, opendap_attrs, [None] * 8)) + list(map(attrs.pop, opendap_attrs, [None] * len(opendap_attrs))) return Frozen(attrs) def get_dimensions(self): @@ -346,10 +346,10 @@ def open_datatree( use_cftime=use_cftime, decode_timedelta=decode_timedelta, group=group, - application=None, + application=application, session=session, timeout=timeout, - verify=application, + verify=verify, user_charset=user_charset, checksums=checksums, ) From ced359f4c16e294fd01173f632d4a004108d835b Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 18 Nov 2025 21:02:05 -0800 Subject: [PATCH 30/31] fix `whats new` changes --- doc/whats-new.rst | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1b8c2995c71..d7f598b6891 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,17 +14,11 @@ v2025.11.1 (unreleased) New Features ~~~~~~~~~~~~ -- :py:func:`merge` and :py:func:`concat` now support :py:class:`DataTree` - objects (:issue:`9790`, :issue:`9778`). - By `Stephan Hoyer `_. -- The ``h5netcdf`` engine has support for pseudo ``NETCDF4_CLASSIC`` files, meaning variables and attributes are cast to supported types. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support with version 1.7.0. (:issue:`10676`, :pull:`10686`). - By `David Huard `_. 
-- Support comparing :py:class:`DataTree` objects with :py:func:`testing.assert_allclose` (:pull:`10887`). - By `Justus Magin `_. - Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) dimensions data (:issue:`10628`, :pull:`10629`). In addition ``checksums=True|False`` is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_. + Breaking Changes ~~~~~~~~~~~~~~~~ From e789324ca90dbea86cc5619c226f53846a7d1b48 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 18 Nov 2025 21:05:20 -0800 Subject: [PATCH 31/31] formatting --- doc/whats-new.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d7f598b6891..e26b8729434 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,8 +14,9 @@ v2025.11.1 (unreleased) New Features ~~~~~~~~~~~~ -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) dimensions data (:issue:`10628`, :pull:`10629`). - In addition ``checksums=True|False`` is added as optional argument to be passed to ``pydap`` backend. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` + when downloading dap4 (opendap) dimensions data (:issue:`10628`, :pull:`10629`). In addition ``checksums=True|False`` + is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_.