Skip to content

Commit 5dc521d

Browse files
committed
First working draft of virtual arrays
1 parent b043010 commit 5dc521d

File tree

2 files changed

+134
-31
lines changed

2 files changed

+134
-31
lines changed

src/uproot/behaviors/RNTuple.py

Lines changed: 129 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import sys
1616
import warnings
1717
from collections.abc import Mapping
18+
from functools import partial
1819

1920
import numpy
2021

@@ -622,6 +623,7 @@ def arrays(
622623
interpreter="cpu",
623624
ak_add_doc=False,
624625
how=None,
626+
virtual=False,
625627
# For compatibility reasons we also accept kwargs meant for TTrees
626628
interpretation_executor=None,
627629
filter_branch=unset,
@@ -677,6 +679,7 @@ def arrays(
677679
``list``, and ``dict``. Note that the container *type itself*
678680
must be passed as ``how``, not an instance of that type (i.e.
679681
``how=tuple``, not ``how=()``).
682+
virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed.
680683
interpretation_executor (None): This argument is not used and is only included for now
681684
for compatibility with software that was written for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
682685
and will be removed in a future version.
@@ -759,22 +762,40 @@ def arrays(
759762
n_padding = self.ntuple.column_records[key_nr].first_element_index
760763
n_padding -= cluster_starts[start_cluster_idx]
761764
n_padding = max(n_padding, 0)
765+
dtype = None
762766
if interpreter == "cpu":
763-
content = self.ntuple.read_cluster_range(
767+
content_generator = partial(
768+
self.ntuple.read_cluster_range,
764769
key_nr,
765770
start_cluster_idx,
766771
stop_cluster_idx,
767772
missing_element_padding=n_padding,
768773
array_cache=array_cache,
769774
)
775+
if virtual:
776+
total_length, _, dtype = (
777+
self.ntuple._expected_array_length_starts_dtype(
778+
key_nr,
779+
start_cluster_idx,
780+
stop_cluster_idx,
781+
missing_element_padding=n_padding,
782+
)
783+
)
784+
if "cardinality" in key:
785+
total_length -= 1
786+
content = (total_length, content_generator)
787+
else:
788+
content = content_generator()
770789
elif interpreter == "gpu" and backend == "cuda":
771790
content = content_dict[key_nr]
772791
elif interpreter == "gpu":
773792
raise NotImplementedError(
774793
f"Backend {backend} GDS support not implemented."
775794
)
795+
else:
796+
raise NotImplementedError(f"Backend {backend} not implemented.")
776797
dtype_byte = self.ntuple.column_records[key_nr].type
777-
_fill_container_dict(container_dict, content, key, dtype_byte)
798+
_fill_container_dict(container_dict, content, key, dtype_byte, dtype)
778799

779800
cluster_offset = cluster_starts[start_cluster_idx]
780801
entry_start -= cluster_offset
@@ -1778,36 +1799,116 @@ def _cupy_insert(arr, obj, value):
17781799
return out
17791800

17801801

1781-
def _fill_container_dict(container_dict, content, key, dtype_byte):
1782-
array_library_string = uproot._util.get_array_library(content)
1802+
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype):
1803+
from awkward._nplikes.numpy import Numpy
1804+
from awkward._nplikes.virtual import VirtualNDArray
1805+
1806+
if isinstance(content, tuple):
1807+
# Virtual arrays not yet implemented for GPU
1808+
array_library_string = "numpy"
1809+
virtual = True
1810+
length = int(content[0])
1811+
raw_generator = content[1]
1812+
else:
1813+
virtual = False
1814+
array_library_string = uproot._util.get_array_library(content)
1815+
length = len(content)
1816+
1817+
def raw_generator():
1818+
return content
17831819

17841820
library = numpy if array_library_string == "numpy" else uproot.extras.cupy()
17851821

17861822
if "cardinality" in key:
1787-
content = library.diff(content)
1788-
1789-
if "optional" in key:
1790-
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1791-
diff = library.diff(content)
1792-
missing = library.nonzero(diff == 0)[0]
1793-
missing -= library.arange(len(missing), dtype=missing.dtype)
1794-
dtype = "int64" if content.dtype == library.uint64 else "int32"
1795-
indices = library.arange(len(content) - len(missing), dtype=dtype)
1796-
if array_library_string == "numpy":
1797-
indices = numpy.insert(indices, missing, -1)
1823+
1824+
def generator():
1825+
materialized = raw_generator()
1826+
materialized = library.diff(materialized)
1827+
return materialized
1828+
1829+
if virtual:
1830+
virtual_array = VirtualNDArray(
1831+
Numpy.instance(), shape=(length,), dtype=dtype, generator=generator
1832+
)
1833+
container_dict[f"{key}-data"] = virtual_array
17981834
else:
1799-
indices = _cupy_insert(indices, missing, -1)
1800-
container_dict[f"{key}-index"] = indices
1835+
container_dict[f"{key}-data"] = generator()
1836+
1837+
elif "optional" in key:
1838+
1839+
def generator():
1840+
# We need to convert from a ListOffsetArray to an IndexedOptionArray
1841+
materialized = raw_generator()
1842+
diff = library.diff(materialized)
1843+
missing = library.nonzero(diff == 0)[0]
1844+
missing -= library.arange(len(missing), dtype=missing.dtype)
1845+
dtype = "int64" if materialized.dtype == library.int64 else "int32"
1846+
indices = library.arange(len(materialized) - len(missing), dtype=dtype)
1847+
if array_library_string == "numpy":
1848+
indices = numpy.insert(indices, missing, -1)
1849+
else:
1850+
indices = _cupy_insert(indices, missing, -1)
1851+
return indices[:-1] # We need to delete the last index
1852+
1853+
if virtual:
1854+
virtual_array = VirtualNDArray(
1855+
Numpy.instance(), shape=(length - 1,), dtype=dtype, generator=generator
1856+
)
1857+
container_dict[f"{key}-index"] = virtual_array
1858+
else:
1859+
container_dict[f"{key}-index"] = generator()
1860+
18011861
elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
1802-
tags = content["tag"].astype(numpy.int8)
1803-
kindex = content["index"]
1804-
# Find invalid variants and adjust buffers accordingly
1805-
invalid = numpy.flatnonzero(tags == 0)
1806-
kindex[invalid] = 0 # Might not be necessary, but safer
1807-
container_dict[f"{key}-index"] = library.array(kindex)
1808-
container_dict[f"{key}-tags"] = library.array(tags)
1809-
container_dict["nones-index"] = library.array([-1], dtype=numpy.int64)
1862+
1863+
def tag_generator():
1864+
content = raw_generator()
1865+
return content["tag"].astype(numpy.int8)
1866+
1867+
def index_generator():
1868+
content = raw_generator()
1869+
tags = content["tag"].astype(numpy.int8)
1870+
kindex = content["index"]
1871+
# Find invalid variants and adjust buffers accordingly
1872+
invalid = numpy.flatnonzero(tags == 0)
1873+
kindex[invalid] = 0 # Might not be necessary, but safer
1874+
return kindex
1875+
1876+
def nones_index_generator():
1877+
return library.array([-1], dtype=numpy.int64)
1878+
1879+
if virtual:
1880+
tag_virtual_array = VirtualNDArray(
1881+
Numpy.instance(),
1882+
shape=(length,),
1883+
dtype=numpy.int8,
1884+
generator=tag_generator,
1885+
)
1886+
container_dict[f"{key}-tags"] = tag_virtual_array
1887+
index_virtual_array = VirtualNDArray(
1888+
Numpy.instance(),
1889+
shape=(length,),
1890+
dtype=numpy.int64,
1891+
generator=index_generator,
1892+
)
1893+
container_dict[f"{key}-index"] = index_virtual_array
1894+
nones_index_virtual_array = VirtualNDArray(
1895+
Numpy.instance(),
1896+
shape=(1,),
1897+
dtype=numpy.int64,
1898+
generator=nones_index_generator,
1899+
)
1900+
container_dict["nones-index"] = nones_index_virtual_array
1901+
else:
1902+
container_dict[f"{key}-tags"] = tag_generator()
1903+
container_dict[f"{key}-index"] = index_generator()
1904+
container_dict["nones-index"] = nones_index_generator()
18101905
else:
1811-
# don't distinguish data and offsets
1812-
container_dict[f"{key}-data"] = content
1813-
container_dict[f"{key}-offsets"] = content
1906+
if virtual:
1907+
virtual_array = VirtualNDArray(
1908+
Numpy.instance(), shape=(length,), dtype=dtype, generator=raw_generator
1909+
)
1910+
container_dict[f"{key}-data"] = virtual_array
1911+
container_dict[f"{key}-offsets"] = virtual_array
1912+
else:
1913+
container_dict[f"{key}-data"] = content
1914+
container_dict[f"{key}-offsets"] = content

src/uproot/models/RNTuple.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -711,10 +711,10 @@ def read_cluster_range(
711711
Returns a numpy array with the data from the column.
712712
"""
713713
field_metadata = self.get_field_metadata(col_idx)
714-
total_length, starts, _ = self._expected_array_length_starts_dtype(
714+
total_length, starts, dtype = self._expected_array_length_starts_dtype(
715715
col_idx, cluster_start, cluster_stop, missing_element_padding
716716
)
717-
res = numpy.empty(total_length, field_metadata.dtype_result)
717+
res = numpy.empty(total_length, dtype)
718718
# Initialize the padding elements. Note that it might be different from missing_element_padding
719719
# because for offsets there is an extra zero added at the start.
720720
assert len(starts) > 0, "The cluster range is invalid"
@@ -726,7 +726,7 @@ def read_cluster_range(
726726
cluster_idx,
727727
col_idx,
728728
field_metadata,
729-
destination=res[starts[i] : stop],
729+
destination=res[starts[i] : stop].view(field_metadata.dtype),
730730
array_cache=array_cache,
731731
)
732732

@@ -1169,6 +1169,8 @@ def get_field_metadata(self, ncol):
11691169
"std::string"
11701170
):
11711171
dtype_result = dtype
1172+
elif dtype_byte in uproot.const.rntuple_custom_float_types:
1173+
dtype_result = numpy.float32
11721174
else:
11731175
dtype_result = numpy.result_type(*alt_dtype_list)
11741176
field_metadata = FieldClusterMetadata(

0 commit comments

Comments
 (0)