Merged

Changes from all commits (32 commits)
3677dec
update PydapArrayWrapper to support backend batching
Mikejmnez Aug 12, 2025
4187644
rebase
Mikejmnez Nov 10, 2025
729dc49
pydap-server is not necessary
Mikejmnez Aug 12, 2025
1fd9e18
set `batch=False` as default
Mikejmnez Aug 12, 2025
f6a78b0
set `batch=False` as default in datatree
Mikejmnez Aug 12, 2025
326d925
set `batch=False` as default in open groups as dict
Mikejmnez Aug 12, 2025
0f0dede
for flaky, install pydap from repo for now
Mikejmnez Aug 12, 2025
a35efa5
initial tests - quantify cached url
Mikejmnez Aug 13, 2025
fcb2eae
adds tests to datatree backend to assert multiple dimensions download…
Mikejmnez Aug 13, 2025
677e3de
update testing to show number of download urls
Mikejmnez Aug 13, 2025
7f05a6a
simplified logic
Mikejmnez Aug 13, 2025
e360560
specify cached session debug name to actually cache urls
Mikejmnez Aug 13, 2025
c6ed8bf
fix for mypy
Mikejmnez Aug 13, 2025
54f6f8d
user visible changes on `whats-new.rst`
Mikejmnez Aug 13, 2025
419b25e
impose sorted to `get_dimensions` method
Mikejmnez Aug 13, 2025
747fcc7
reformat `whats-new.rst`
Mikejmnez Aug 13, 2025
381c499
revert to install pydap from conda and not from repo
Mikejmnez Aug 13, 2025
5f5c4e1
expose checksum as user kwarg
Mikejmnez Aug 13, 2025
e15f8cb
include `checksums` optional argument in `whats-new`
Mikejmnez Aug 13, 2025
0a2730c
update to newest release of pydap via pip until conda install is avai…
Mikejmnez Aug 13, 2025
a5d2b0f
use requests_cache session with retry-params when 500 errors occur
Mikejmnez Aug 13, 2025
9a88316
update env yml file to use new pydap release via conda
Mikejmnez Aug 14, 2025
d2835ab
turn on testing on datatree from test.opendap.org
Mikejmnez Nov 10, 2025
b60adb5
rebase with main
Mikejmnez Nov 10, 2025
578b31a
update what's new
Mikejmnez Nov 10, 2025
25b08cd
removes batch as arg - acts always but only on dimension data arrays
Mikejmnez Nov 12, 2025
0e1ff6c
updates tests
Mikejmnez Nov 12, 2025
f4f253a
update `whats new`
Mikejmnez Nov 12, 2025
b4c7dda
minor code changes
Mikejmnez Nov 19, 2025
ced359f
fix `whats new` changes
Mikejmnez Nov 19, 2025
e789324
formatting
Mikejmnez Nov 19, 2025
7bcbd7c
Merge branch 'main' into pydap4_scale
dcherian Nov 21, 2025
1 change: 0 additions & 1 deletion ci/requirements/environment.yml
@@ -37,7 +37,6 @@ dependencies:
   - pre-commit
   - pyarrow # pandas raises a deprecation warning without this, breaking doctests
   - pydap
-  - pydap-server
   - pytest
   - pytest-asyncio
   - pytest-cov
5 changes: 5 additions & 0 deletions doc/whats-new.rst
@@ -14,6 +14,11 @@ v2025.11.1 (unreleased)
New Features
~~~~~~~~~~~~

- Improved ``pydap`` backend behavior and performance in :py:func:`open_dataset` and :py:func:`open_datatree`
  when downloading dap4 (opendap) dimension data (:issue:`10628`, :pull:`10629`). In addition, ``checksums=True|False``
  was added as an optional argument passed to the ``pydap`` backend.
  By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.

- :py:func:`combine_nested` now supports :py:class:`DataTree` objects
(:pull:`10849`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
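For context, a minimal sketch of the user-facing call with the new keyword (the URL is the OPeNDAP test server used in this PR's tests; `checksums` is simply forwarded to pydap's dap4 machinery, and `True` remains the default):

```python
import xarray as xr

ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    checksums=False,  # forwarded to pydap; True is the default
    decode_times=False,
)
```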
82 changes: 64 additions & 18 deletions xarray/backends/pydap_.py
@@ -36,7 +36,7 @@


 class PydapArrayWrapper(BackendArray):
-    def __init__(self, array):
+    def __init__(self, array, checksums=True):
         self.array = array

@property
@@ -54,12 +54,10 @@ def __getitem__(self, key):

     def _getitem(self, key):
         result = robust_getitem(self.array, key, catch=ValueError)
-        # in some cases, pydap doesn't squeeze axes automatically like numpy
-        result = np.asarray(result)
+        result = np.asarray(result.data)
         axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
         if result.ndim + len(axis) != self.array.ndim and axis:
             result = np.squeeze(result, axis)
-
         return result


@@ -82,7 +80,14 @@ class PydapDataStore(AbstractDataStore):
be useful if the netCDF4 library is not available.
"""

-    def __init__(self, dataset, group=None):
+    def __init__(
+        self,
+        dataset,
+        group=None,
+        session=None,
+        protocol=None,
+        checksums=True,
+    ):
"""
Parameters
----------
@@ -92,6 +97,8 @@ def __init__(self, dataset, group=None):
"""
         self.dataset = dataset
         self.group = group
+        self._protocol = protocol
+        self._checksums = checksums  # true by default

@classmethod
def open(
@@ -104,6 +111,7 @@ def open(
         timeout=None,
         verify=None,
         user_charset=None,
+        checksums=True,
     ):
from pydap.client import open_url
from pydap.net import DEFAULT_TIMEOUT
@@ -118,6 +126,7 @@
DeprecationWarning,
)
output_grid = False # new default behavior

kwargs = {
"url": url,
"application": application,
@@ -133,22 +142,37 @@
         elif hasattr(url, "ds"):
             # pydap dataset
             dataset = url.ds
-        args = {"dataset": dataset}
+        args = {"dataset": dataset, "checksums": checksums}
         if group:
             # only then, change the default
             args["group"] = group
+        if url.startswith(("http", "dap2")):
+            args["protocol"] = "dap2"
+        elif url.startswith("dap4"):
+            args["protocol"] = "dap4"
         return cls(**args)

     def open_store_variable(self, var):
-        data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
-        try:
+        if hasattr(var, "dims"):
             dimensions = [
                 dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims
             ]
-        except AttributeError:
+        else:
             # GridType does not have a dims attribute - instead get `dimensions`
             # see https://github.com/pydap/pydap/issues/485
             dimensions = var.dimensions
+        if (
+            self._protocol == "dap4"
+            and var.name in dimensions
+            and hasattr(var, "dataset")  # only True for pydap>3.5.5
+        ):
+            var.dataset.enable_batch_mode()
+            data_array = self._get_data_array(var)
+            data = indexing.LazilyIndexedArray(data_array)
+            var.dataset.disable_batch_mode()
Comment on lines +168 to +171

dcherian (Contributor):

As an aside, a `with var.dataset.batch_mode():` context manager would be a nice API for this.

Mikejmnez (Contributor, Author) commented Nov 19, 2025:

Thanks @dcherian. That is a really good suggestion. Currently the `enable_batch_mode` method does not support the context manager protocol (it was never meant to be turned on/off). I totally see what you mean. I'll set it up (and come back to this in a later PR).

dcherian (Contributor):

So why are we turning it off here, then?

Mikejmnez (Contributor, Author) commented Nov 19, 2025:

Short answer: it doesn't need it, but it is there to show that batch mode is only used for dap4 dimensions.

Long answer: the scope of this PR was originally broader, and it worked beyond dimensions. Originally I added an optional parameter `batch=None` (the default), so that while dimensions were always downloaded within a single dap response (in dap4), there was also the option to download non-dimension variables in a single dap response (say, when executing `ds.load()`). With pure pydap there is no distinction between dims and non-dims, but xarray eagerly loads dims into memory, and so I split the logic this way.

I slimmed the PR down to handle only dimensions, and the performance gain is enough when using `xr.open_mfdataset` that I would be pleased to see this merged. I would gladly restore the `batch=None | Iterable` behavior, which further enables non-dimension variables to be batch-downloaded together for additional performance gains. The need for an optional parameter (as opposed to a default for dap4) relates to best/safe practices when the remote URL points to a virtually aggregated dataset (for example, `.ncml`). In that scenario batch downloading should probably be avoided, so the behavior needs to be optional and user-specified.

Mikejmnez (Contributor, Author):

> Short answer: it doesn't need it, but it is there to show that batch mode is only used for dap4 dimensions.

I stand corrected - I ran some tests with `ds.load()`, and with the current behavior batch mode does need to be disabled. I like the idea of using the context manager protocol, but it is not currently implemented. Apologies - I have looked at this for so long that I am starting to get confused by the different iterations of this PR.

So this PR would either:

a) Stay as is: download only dims within a single dap URL.
b) Incorporate more general behavior (enable non-dims, the original purpose of this PR).

I have no urgency on this, and my preference would be b) if that is OK with you. I could implement/enable the context manager protocol to improve the API and include it in pydap's next release.

dcherian (Contributor):

Happy to defer to your judgement and preferences here :)

The context manager is just a nice-to-have and not a blocker. It does sound like there's already a nice improvement. I'm happy to merge as-is at the moment.

Member:

Generally I recommend merging smaller, incremental changes whenever feasible. They are easier to review, and they get improvements out into the world faster.

(There is a separate question of whether the pydap backend should be split off from Xarray, given its growing complexity, but I'm also happy to defer that to another day.)

Mikejmnez (Contributor, Author):

> Generally I recommend merging smaller, incremental changes whenever feasible.

Yeah - let's merge as is. This is ready, and it will be nice to get something out into the world right now. It's been waaay too long...

> (There is a separate question of whether the pydap backend should be split off from Xarray, given its growing complexity, but I'm also happy to defer that to another day.)

I can see the "growing complexity" argument for maintainers/developers. Definitely a question for another day. I would be happy to be part of the conversation.
+        else:
+            # all non-dimension variables
+            data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))

         return Variable(dimensions, data, var.attributes)
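As a follow-up to the review thread above: the enable/disable pair could be wrapped in the suggested context manager with a few lines. This is only a sketch of the proposed API - pydap does not currently ship a `batch_mode()` context manager; `enable_batch_mode`/`disable_batch_mode` are the hooks actually used in this diff:

```python
from contextlib import contextmanager

@contextmanager
def batch_mode(dataset):
    # Hypothetical helper wrapping the enable/disable pair used in
    # open_store_variable, so batching is switched off even on error.
    dataset.enable_batch_mode()
    try:
        yield dataset
    finally:
        dataset.disable_batch_mode()
```

With that in place, the dap4 branch above could read `with batch_mode(var.dataset): data = indexing.LazilyIndexedArray(self._get_data_array(var))`.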

def get_variables(self):
@@ -166,6 +190,7 @@ def get_variables(self):
# check the key is not a BaseType or GridType
if not isinstance(self.ds[var], GroupType)
]

return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars)

def get_attrs(self):
@@ -177,18 +202,33 @@ def get_attrs(self):
"libdap",
"invocation",
"dimensions",
"path",
"Maps",
)
attrs = self.ds.attributes
list(map(attrs.pop, opendap_attrs, [None] * 6))
attrs = dict(self.ds.attributes)
list(map(attrs.pop, opendap_attrs, [None] * len(opendap_attrs)))
return Frozen(attrs)

     def get_dimensions(self):
-        return Frozen(self.ds.dimensions)
+        return Frozen(sorted(self.ds.dimensions))

@property
def ds(self):
return get_group(self.dataset, self.group)

+    def _get_data_array(self, var):
+        """gets dimension data all at once, storing the numpy
+        arrays within a cached dictionary
+        """
+        from pydap.client import get_batch_data
+
+        if not var._is_data_loaded():
+            # data has not been deserialized yet
+            # runs only once per store/hierarchy
+            get_batch_data(var, checksums=self._checksums)
+
+        return self.dataset[var.id].data
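For reference, `_is_data_loaded` and `get_batch_data` are the pydap (>3.5.5) hooks that make the single-request behavior possible. Roughly, the intended flow at the pydap level looks like the sketch below (illustrative only - the URL and the dimension names `TIME`/`COADSY` come from the test dataset used later in this PR):

```python
from pydap.client import get_batch_data, open_url

pyds = open_url("dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc")
pyds.enable_batch_mode()

time = pyds["TIME"]
if not time._is_data_loaded():
    # One dap request fetches and deserializes every dimension array at once.
    get_batch_data(time, checksums=True)

lat = pyds["COADSY"].data  # already in memory - no additional request
```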


class PydapBackendEntrypoint(BackendEntrypoint):
"""
@@ -250,6 +290,7 @@ def open_dataset(
         timeout=None,
         verify=None,
         user_charset=None,
+        checksums=True,
     ) -> Dataset:
store = PydapDataStore.open(
url=filename_or_obj,
@@ -260,6 +301,7 @@
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            checksums=checksums,
         )
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
@@ -292,6 +334,7 @@ def open_datatree(
         timeout=None,
         verify=None,
         user_charset=None,
+        checksums=True,
     ) -> DataTree:
groups_dict = self.open_groups_as_dict(
filename_or_obj,
@@ -303,11 +346,12 @@
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
group=group,
-            application=None,
-            session=None,
-            timeout=None,
-            verify=None,
-            user_charset=None,
+            application=application,
+            session=session,
+            timeout=timeout,
+            verify=verify,
+            user_charset=user_charset,
+            checksums=checksums,
)

return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -329,6 +373,7 @@ def open_groups_as_dict(
         timeout=None,
         verify=None,
         user_charset=None,
+        checksums=True,
     ) -> dict[str, Dataset]:
from xarray.core.treenode import NodePath

@@ -340,6 +385,7 @@
             timeout=timeout,
             verify=verify,
             user_charset=user_charset,
+            checksums=checksums,
         )

# Check for a group and make it a parent if it exists
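One behavioral fix worth noting in the hunks above: `open_datatree` previously hardcoded `application`, `session`, `timeout`, `verify`, and `user_charset` to `None` when delegating to `open_groups_as_dict`, silently dropping user-supplied values. A sketch of a call that now propagates as expected (the URL is from the datatree tests below; the cache path is arbitrary):

```python
import xarray as xr
from pydap.net import create_session

# The cached session is now actually forwarded to the pydap backend.
session = create_session(use_cache=True, cache_kwargs={"cache_name": "/tmp/debug"})
tree = xr.open_datatree(
    "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5",
    engine="pydap",
    session=session,
    checksums=True,
)
```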
40 changes: 40 additions & 0 deletions xarray/tests/test_backends.py
@@ -6560,6 +6560,46 @@ def test_session(self) -> None:
)


@requires_pydap
@network
@pytest.mark.parametrize("protocol", ["dap2", "dap4"])
def test_batchdap4_downloads(tmpdir, protocol) -> None:
"""Test that in dap4, all dimensions are downloaded at once"""
import pydap
from pydap.net import create_session

_version_ = Version(pydap.__version__)
# Create a session with pre-set params in pydap backend, to cache urls
cache_name = tmpdir / "debug"
session = create_session(use_cache=True, cache_kwargs={"cache_name": cache_name})
session.cache.clear()
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"

ds = open_dataset(
url.replace("https", protocol),
session=session,
engine="pydap",
decode_times=False,
)

if protocol == "dap4":
if _version_ > Version("3.5.5"):
# total downloads are:
# 1 dmr + 1 dap (all dimensions at once)
assert len(session.cache.urls()) == 2
# now load the rest of the variables
ds.load()
# each non-dimension array is downloaded with an individual https request
assert len(session.cache.urls()) == 2 + 4
else:
assert len(session.cache.urls()) == 4
ds.load()
assert len(session.cache.urls()) == 4 + 4
elif protocol == "dap2":
# das + dds + 3 dods urls for dimensions alone
assert len(session.cache.urls()) == 5
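The same accounting can be reproduced outside the test suite; this is the test above rewritten as an interactive recipe (assuming pydap > 3.5.5, so that dimensions arrive in one batched dap response; the cache path is arbitrary):

```python
import xarray as xr
from pydap.net import create_session

session = create_session(use_cache=True, cache_kwargs={"cache_name": "/tmp/debug"})
session.cache.clear()

ds = xr.open_dataset(
    "dap4://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc",
    engine="pydap",
    session=session,
    decode_times=False,
)
print(len(session.cache.urls()))  # 2: one dmr + one batched dap for all dimensions
ds.load()
print(len(session.cache.urls()))  # 6: one extra dap request per non-dimension variable
```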


class TestEncodingInvalid:
def test_extract_nc4_variable_encoding(self) -> None:
var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"})
29 changes: 24 additions & 5 deletions xarray/tests/test_backends_datatree.py
@@ -9,6 +9,7 @@

 import numpy as np
 import pytest
+from packaging.version import Version

 import xarray as xr
from xarray import DataTree, load_datatree, open_datatree, open_groups
@@ -569,8 +570,6 @@ def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree) -> None:
class TestPyDAPDatatreeIO:
"""Test PyDAP backend for DataTree."""

-    pytestmark = pytest.mark.xfail(reason="test.opendap.org reports a 404 error")
-
engine: T_DataTreeNetcdfEngine | None = "pydap"
# you can check these by adding a .dmr to urls, and replacing dap4 with http
unaligned_datatree_url = (
@@ -582,7 +581,8 @@ class TestPyDAPDatatreeIO:
simplegroup_datatree_url = "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5"

def test_open_datatree_unaligned_hierarchy(
-        self, url=unaligned_datatree_url
+        self,
+        url=unaligned_datatree_url,
) -> None:
with pytest.raises(
ValueError,
@@ -615,7 +615,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None:
) as expected:
assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)

-    def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
+    def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None:
"""Test that `open_datatree` inherits coordinates from root tree.

This particular h5 file is a test file that inherits the time coordinate from the root
@@ -641,7 +641,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
│ Temperature (time, Z, Y, X) float32 ...
| Salinity (time, Z, Y, X) float32 ...
"""
-        tree = open_datatree(url, engine=self.engine)
+        import pydap
+        from pydap.net import create_session
+
+        # Create a session with pre-set retry params in pydap backend, to cache urls
+        cache_name = tmpdir / "debug"
+        session = create_session(
+            use_cache=True, cache_kwargs={"cache_name": cache_name}
+        )
+        session.cache.clear()
+
+        _version_ = Version(pydap.__version__)
+
+        tree = open_datatree(url, engine=self.engine, session=session)
assert set(tree.dims) == {"time", "Z", "nv"}
assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
@@ -652,6 +664,13 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
list(expected.dims) + ["Z", "nv"]
)

+        if _version_ > Version("3.5.5"):
+            # Total downloads are: 1 dmr, + 1 dap url for all dimensions for each group
+            assert len(session.cache.urls()) == 3
+        else:
+            # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays)
+            assert len(session.cache.urls()) == 5

def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
aligned_dict_of_datasets = open_groups(url, engine=self.engine)
aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)