Skip to content

Commit a8efe17

Browse files
Mikejmnezdcherian
andauthored
[pydap backend] enables downloading multiple dim arrays within single http request (#10629)
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
1 parent ad92a16 commit a8efe17

File tree

5 files changed

+133
-24
lines changed

5 files changed

+133
-24
lines changed

ci/requirements/environment.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ dependencies:
3737
- pre-commit
3838
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
3939
- pydap
40-
- pydap-server
4140
- pytest
4241
- pytest-asyncio
4342
- pytest-cov

doc/whats-new.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ v2025.11.1 (unreleased)
1414
New Features
1515
~~~~~~~~~~~~
1616

17+
- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree`
18+
when downloading dap4 (opendap) dimensions data (:issue:`10628`, :pull:`10629`). In addition ``checksums=True|False``
19+
is added as optional argument to be passed to ``pydap`` backend.
20+
By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.
21+
1722
- :py:func:`combine_nested` now support :py:class:`DataTree` objects
1823
(:pull:`10849`).
1924
By `Stephan Hoyer <https://github.com/shoyer>`_.

xarray/backends/pydap_.py

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636

3737

3838
class PydapArrayWrapper(BackendArray):
39-
def __init__(self, array):
39+
def __init__(self, array, checksums=True):
4040
self.array = array
4141

4242
@property
@@ -54,12 +54,10 @@ def __getitem__(self, key):
5454

5555
def _getitem(self, key):
5656
result = robust_getitem(self.array, key, catch=ValueError)
57-
# in some cases, pydap doesn't squeeze axes automatically like numpy
58-
result = np.asarray(result)
57+
result = np.asarray(result.data)
5958
axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
6059
if result.ndim + len(axis) != self.array.ndim and axis:
6160
result = np.squeeze(result, axis)
62-
6361
return result
6462

6563

@@ -82,7 +80,14 @@ class PydapDataStore(AbstractDataStore):
8280
be useful if the netCDF4 library is not available.
8381
"""
8482

85-
def __init__(self, dataset, group=None):
83+
def __init__(
84+
self,
85+
dataset,
86+
group=None,
87+
session=None,
88+
protocol=None,
89+
checksums=True,
90+
):
8691
"""
8792
Parameters
8893
----------
@@ -92,6 +97,8 @@ def __init__(self, dataset, group=None):
9297
"""
9398
self.dataset = dataset
9499
self.group = group
100+
self._protocol = protocol
101+
self._checksums = checksums # true by default
95102

96103
@classmethod
97104
def open(
@@ -104,6 +111,7 @@ def open(
104111
timeout=None,
105112
verify=None,
106113
user_charset=None,
114+
checksums=True,
107115
):
108116
from pydap.client import open_url
109117
from pydap.net import DEFAULT_TIMEOUT
@@ -118,6 +126,7 @@ def open(
118126
DeprecationWarning,
119127
)
120128
output_grid = False # new default behavior
129+
121130
kwargs = {
122131
"url": url,
123132
"application": application,
@@ -133,22 +142,37 @@ def open(
133142
elif hasattr(url, "ds"):
134143
# pydap dataset
135144
dataset = url.ds
136-
args = {"dataset": dataset}
145+
args = {"dataset": dataset, "checksums": checksums}
137146
if group:
138-
# only then, change the default
139147
args["group"] = group
148+
if url.startswith(("http", "dap2")):
149+
args["protocol"] = "dap2"
150+
elif url.startswith("dap4"):
151+
args["protocol"] = "dap4"
140152
return cls(**args)
141153

142154
def open_store_variable(self, var):
143-
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
144-
try:
155+
if hasattr(var, "dims"):
145156
dimensions = [
146157
dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
147158
]
148-
except AttributeError:
159+
else:
149160
# GridType does not have a dims attribute - instead get `dimensions`
150161
# see https://github.com/pydap/pydap/issues/485
151162
dimensions = var.dimensions
163+
if (
164+
self._protocol == "dap4"
165+
and var.name in dimensions
166+
and hasattr(var, "dataset") # only True for pydap>3.5.5
167+
):
168+
var.dataset.enable_batch_mode()
169+
data_array = self._get_data_array(var)
170+
data = indexing.LazilyIndexedArray(data_array)
171+
var.dataset.disable_batch_mode()
172+
else:
173+
# all non-dimension variables
174+
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
175+
152176
return Variable(dimensions, data, var.attributes)
153177

154178
def get_variables(self):
@@ -166,6 +190,7 @@ def get_variables(self):
166190
# check the key is not a BaseType or GridType
167191
if not isinstance(self.ds[var], GroupType)
168192
]
193+
169194
return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)
170195

171196
def get_attrs(self):
@@ -177,18 +202,33 @@ def get_attrs(self):
177202
"libdap",
178203
"invocation",
179204
"dimensions",
205+
"path",
206+
"Maps",
180207
)
181-
attrs = self.ds.attributes
182-
list(map(attrs.pop, opendap_attrs, [None] * 6))
208+
attrs = dict(self.ds.attributes)
209+
list(map(attrs.pop, opendap_attrs, [None] * len(opendap_attrs)))
183210
return Frozen(attrs)
184211

185212
def get_dimensions(self):
186-
return Frozen(self.ds.dimensions)
213+
return Frozen(sorted(self.ds.dimensions))
187214

188215
@property
189216
def ds(self):
190217
return get_group(self.dataset, self.group)
191218

219+
def _get_data_array(self, var):
220+
"""gets dimension data all at once, storing the numpy
221+
arrays within a cached dictionary
222+
"""
223+
from pydap.client import get_batch_data
224+
225+
if not var._is_data_loaded():
226+
# data has not been deserialized yet
227+
# runs only once per store/hierarchy
228+
get_batch_data(var, checksums=self._checksums)
229+
230+
return self.dataset[var.id].data
231+
192232

193233
class PydapBackendEntrypoint(BackendEntrypoint):
194234
"""
@@ -250,6 +290,7 @@ def open_dataset(
250290
timeout=None,
251291
verify=None,
252292
user_charset=None,
293+
checksums=True,
253294
) -> Dataset:
254295
store = PydapDataStore.open(
255296
url=filename_or_obj,
@@ -260,6 +301,7 @@ def open_dataset(
260301
timeout=timeout,
261302
verify=verify,
262303
user_charset=user_charset,
304+
checksums=checksums,
263305
)
264306
store_entrypoint = StoreBackendEntrypoint()
265307
with close_on_error(store):
@@ -292,6 +334,7 @@ def open_datatree(
292334
timeout=None,
293335
verify=None,
294336
user_charset=None,
337+
checksums=True,
295338
) -> DataTree:
296339
groups_dict = self.open_groups_as_dict(
297340
filename_or_obj,
@@ -303,11 +346,12 @@ def open_datatree(
303346
use_cftime=use_cftime,
304347
decode_timedelta=decode_timedelta,
305348
group=group,
306-
application=None,
307-
session=None,
308-
timeout=None,
309-
verify=None,
310-
user_charset=None,
349+
application=application,
350+
session=session,
351+
timeout=timeout,
352+
verify=verify,
353+
user_charset=user_charset,
354+
checksums=checksums,
311355
)
312356

313357
return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -329,6 +373,7 @@ def open_groups_as_dict(
329373
timeout=None,
330374
verify=None,
331375
user_charset=None,
376+
checksums=True,
332377
) -> dict[str, Dataset]:
333378
from xarray.core.treenode import NodePath
334379

@@ -340,6 +385,7 @@ def open_groups_as_dict(
340385
timeout=timeout,
341386
verify=verify,
342387
user_charset=user_charset,
388+
checksums=checksums,
343389
)
344390

345391
# Check for a group and make it a parent if it exists

xarray/tests/test_backends.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6560,6 +6560,46 @@ def test_session(self) -> None:
65606560
)
65616561

65626562

6563+
@requires_pydap
6564+
@network
6565+
@pytest.mark.parametrize("protocol", ["dap2", "dap4"])
6566+
def test_batchdap4_downloads(tmpdir, protocol) -> None:
6567+
"""Test that in dap4, all dimensions are downloaded at once"""
6568+
import pydap
6569+
from pydap.net import create_session
6570+
6571+
_version_ = Version(pydap.__version__)
6572+
# Create a session with pre-set params in pydap backend, to cache urls
6573+
cache_name = tmpdir / "debug"
6574+
session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name})
6575+
session.cache.clear()
6576+
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"
6577+
6578+
ds = open_dataset(
6579+
url.replace("https", protocol),
6580+
session=session,
6581+
engine="pydap",
6582+
decode_times=False,
6583+
)
6584+
6585+
if protocol == "dap4":
6586+
if _version_ > Version("3.5.5"):
6587+
# total downloads are:
6588+
# 1 dmr + 1 dap (all dimensions at once)
6589+
assert len(session.cache.urls()) == 2
6590+
# now load the rest of the variables
6591+
ds.load()
6592+
# each non-dimension array is downloaded with an individual https requests
6593+
assert len(session.cache.urls()) == 2 + 4
6594+
else:
6595+
assert len(session.cache.urls()) == 4
6596+
ds.load()
6597+
assert len(session.cache.urls()) == 4 + 4
6598+
elif protocol == "dap2":
6599+
# das + dds + 3 dods urls for dimensions alone
6600+
assert len(session.cache.urls()) == 5
6601+
6602+
65636603
class TestEncodingInvalid:
65646604
def test_extract_nc4_variable_encoding(self) -> None:
65656605
var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"})

xarray/tests/test_backends_datatree.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import numpy as np
1111
import pytest
12+
from packaging.version import Version
1213

1314
import xarray as xr
1415
from xarray import DataTree, load_datatree, open_datatree, open_groups
@@ -569,8 +570,6 @@ def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree) -> None:
569570
class TestPyDAPDatatreeIO:
570571
"""Test PyDAP backend for DataTree."""
571572

572-
pytestmark = pytest.mark.xfail(reason="test.opendap.org reports a 404 error")
573-
574573
engine: T_DataTreeNetcdfEngine | None = "pydap"
575574
# you can check these by adding a .dmr to urls, and replacing dap4 with http
576575
unaligned_datatree_url = (
@@ -582,7 +581,8 @@ class TestPyDAPDatatreeIO:
582581
simplegroup_datatree_url = "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5"
583582

584583
def test_open_datatree_unaligned_hierarchy(
585-
self, url=unaligned_datatree_url
584+
self,
585+
url=unaligned_datatree_url,
586586
) -> None:
587587
with pytest.raises(
588588
ValueError,
@@ -615,7 +615,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None:
615615
) as expected:
616616
assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)
617617

618-
def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
618+
def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None:
619619
"""Test that `open_datatree` inherits coordinates from root tree.
620620
621621
This particular h5 file is a test file that inherits the time coordinate from the root
@@ -641,7 +641,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
641641
│ Temperature (time, Z, Y, X) float32 ...
642642
| Salinity (time, Z, Y, X) float32 ...
643643
"""
644-
tree = open_datatree(url, engine=self.engine)
644+
import pydap
645+
from pydap.net import create_session
646+
647+
# Create a session with pre-set retry params in pydap backend, to cache urls
648+
cache_name = tmpdir / "debug"
649+
session = create_session(
650+
use_cache=True, cache_kwargs={"cache_name": cache_name}
651+
)
652+
session.cache.clear()
653+
654+
_version_ = Version(pydap.__version__)
655+
656+
tree = open_datatree(url, engine=self.engine, session=session)
645657
assert set(tree.dims) == {"time", "Z", "nv"}
646658
assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
647659
assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
@@ -652,6 +664,13 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
652664
list(expected.dims) + ["Z", "nv"]
653665
)
654666

667+
if _version_ > Version("3.5.5"):
668+
# Total downloads are: 1 dmr, + 1 dap url for all dimensions for each group
669+
assert len(session.cache.urls()) == 3
670+
else:
671+
# 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays)
672+
assert len(session.cache.urls()) == 5
673+
655674
def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
656675
aligned_dict_of_datasets = open_groups(url, engine=self.engine)
657676
aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)

0 commit comments

Comments
 (0)