|
15 | 15 | import sys |
16 | 16 | import warnings |
17 | 17 | from collections.abc import Mapping |
| 18 | +from functools import partial |
18 | 19 |
|
19 | 20 | import numpy |
20 | 21 |
|
@@ -622,6 +623,7 @@ def arrays( |
622 | 623 | interpreter="cpu", |
623 | 624 | ak_add_doc=False, |
624 | 625 | how=None, |
| 626 | + virtual=False, |
625 | 627 | # For compatibility reasons we also accept kwargs meant for TTrees |
626 | 628 | interpretation_executor=None, |
627 | 629 | filter_branch=unset, |
@@ -677,6 +679,7 @@ def arrays( |
677 | 679 | ``list``, and ``dict``. Note that the container *type itself* |
678 | 680 | must be passed as ``how``, not an instance of that type (i.e. |
679 | 681 | ``how=tuple``, not ``how=()``). |
| 682 | + virtual (bool): If True, return virtual Awkward arrays, meaning that the data will not be loaded into memory until it is accessed. |
680 | 683 | interpretation_executor (None): This argument is not used and is only included for now |
681 | 684 | for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used |
682 | 685 | and will be removed in a future version. |
@@ -759,22 +762,40 @@ def arrays( |
759 | 762 | n_padding = self.ntuple.column_records[key_nr].first_element_index |
760 | 763 | n_padding -= cluster_starts[start_cluster_idx] |
761 | 764 | n_padding = max(n_padding, 0) |
| 765 | + dtype = None |
762 | 766 | if interpreter == "cpu": |
763 | | - content = self.ntuple.read_cluster_range( |
| 767 | + content_generator = partial( |
| 768 | + self.ntuple.read_cluster_range, |
764 | 769 | key_nr, |
765 | 770 | start_cluster_idx, |
766 | 771 | stop_cluster_idx, |
767 | 772 | missing_element_padding=n_padding, |
768 | 773 | array_cache=array_cache, |
769 | 774 | ) |
| 775 | + if virtual: |
| 776 | + total_length, _, dtype = ( |
| 777 | + self.ntuple._expected_array_length_starts_dtype( |
| 778 | + key_nr, |
| 779 | + start_cluster_idx, |
| 780 | + stop_cluster_idx, |
| 781 | + missing_element_padding=n_padding, |
| 782 | + ) |
| 783 | + ) |
| 784 | + if "cardinality" in key: |
| 785 | + total_length -= 1 |
| 786 | + content = (total_length, content_generator) |
| 787 | + else: |
| 788 | + content = content_generator() |
770 | 789 | elif interpreter == "gpu" and backend == "cuda": |
771 | 790 | content = content_dict[key_nr] |
772 | 791 | elif interpreter == "gpu": |
773 | 792 | raise NotImplementedError( |
774 | 793 | f"Backend {backend} GDS support not implemented." |
775 | 794 | ) |
| 795 | + else: |
| 796 | + raise NotImplementedError(f"Backend {backend} not implemented.") |
776 | 797 | dtype_byte = self.ntuple.column_records[key_nr].type |
777 | | - _fill_container_dict(container_dict, content, key, dtype_byte) |
| 798 | + _fill_container_dict(container_dict, content, key, dtype_byte, dtype) |
778 | 799 |
|
779 | 800 | cluster_offset = cluster_starts[start_cluster_idx] |
780 | 801 | entry_start -= cluster_offset |
@@ -1778,36 +1799,116 @@ def _cupy_insert(arr, obj, value): |
1778 | 1799 | return out |
1779 | 1800 |
|
1780 | 1801 |
|
def _fill_container_dict(container_dict, content, key, dtype_byte, dtype=None):
    """
    Args:
        container_dict (dict): Buffer container that is filled in place. The
            buffers are stored under ``f"{key}-data"``, ``f"{key}-offsets"``,
            ``f"{key}-index"``, ``f"{key}-tags"`` and ``"nones-index"``,
            depending on the kind of column.
        content (array or tuple): Either an already materialized column array,
            or a ``(length, generator)`` tuple requesting virtual buffers.
            ``generator`` is a zero-argument callable returning the column
            array and ``length`` is the length declared for the resulting
            buffer.
        key (str): Form key of the column. Keys containing ``"cardinality"``
            or ``"optional"`` select the corresponding conversion.
        dtype_byte: Column type number; compared against
            ``uproot.const.rntuple_col_type_to_num_dict["switch"]``.
        dtype: dtype declared for the virtual buffers built from ``content``.
            It is not used when ``content`` is already materialized, nor for
            switch columns, whose buffer dtypes are fixed.

    Converts one column's content into the buffer(s) expected for its form
    key and stores them in ``container_dict``.

    If ``content`` is a ``(length, generator)`` tuple, the buffers are
    ``VirtualNDArray`` objects wrapping generator functions; otherwise the
    same generator functions are called immediately and their results are
    stored. Each generator calls the content generator on its own, so the
    tags and index buffers of a switch column each trigger a separate call.

    For ``"cardinality"`` keys the declared ``length`` is used unchanged as
    the buffer length, so it has to be the length *after* differencing. For
    ``"optional"`` keys the buffer length is ``length - 1``.
    """
    virtual = isinstance(content, tuple)

    if virtual:
        # Imported only on the virtual path so that eager reads do not depend
        # on these underscore-prefixed Awkward modules being importable.
        from awkward._nplikes.numpy import Numpy
        from awkward._nplikes.virtual import VirtualNDArray

        # Virtual arrays not yet implemented for GPU
        array_library_string = "numpy"
        length = int(content[0])
        raw_generator = content[1]
        nplike = Numpy.instance()

        def make_buffer(shape, buffer_dtype, generator):
            return VirtualNDArray(
                nplike, shape=shape, dtype=buffer_dtype, generator=generator
            )

    else:
        array_library_string = uproot._util.get_array_library(content)
        length = len(content)

        def raw_generator():
            return content

        def make_buffer(shape, buffer_dtype, generator):
            # Eager path: ``shape`` and ``buffer_dtype`` only describe virtual
            # buffers and are ignored; the generator's result is stored as is.
            return generator()

    library = numpy if array_library_string == "numpy" else uproot.extras.cupy()

    if "cardinality" in key:

        def cardinality_generator():
            return library.diff(raw_generator())

        container_dict[f"{key}-data"] = make_buffer(
            (length,), dtype, cardinality_generator
        )

    elif "optional" in key:

        def optional_index_generator():
            # We need to convert from a ListOffsetArray to an IndexedOptionArray
            offsets = raw_generator()
            diff = library.diff(offsets)
            missing = library.nonzero(diff == 0)[0]
            missing -= library.arange(len(missing), dtype=missing.dtype)
            # Named ``index_dtype`` so it does not shadow the ``dtype`` parameter.
            index_dtype = "int64" if offsets.dtype == library.int64 else "int32"
            indices = library.arange(len(offsets) - len(missing), dtype=index_dtype)
            if array_library_string == "numpy":
                indices = numpy.insert(indices, missing, -1)
            else:
                indices = _cupy_insert(indices, missing, -1)
            return indices[:-1]  # We need to delete the last index

        container_dict[f"{key}-index"] = make_buffer(
            (length - 1,), dtype, optional_index_generator
        )

    elif dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:

        def tag_generator():
            return raw_generator()["tag"].astype(numpy.int8)

        def index_generator():
            variants = raw_generator()
            tags = variants["tag"].astype(numpy.int8)
            kindex = variants["index"]
            # Find invalid variants and adjust buffers accordingly
            invalid = numpy.flatnonzero(tags == 0)
            kindex[invalid] = 0  # Might not be necessary, but safer
            return kindex

        def nones_index_generator():
            return library.array([-1], dtype=numpy.int64)

        container_dict[f"{key}-tags"] = make_buffer(
            (length,), numpy.int8, tag_generator
        )
        container_dict[f"{key}-index"] = make_buffer(
            (length,), numpy.int64, index_generator
        )
        container_dict["nones-index"] = make_buffer(
            (1,), numpy.int64, nones_index_generator
        )

    else:
        # don't distinguish data and offsets: both keys share one buffer object
        buffer = make_buffer((length,), dtype, raw_generator)
        container_dict[f"{key}-data"] = buffer
        container_dict[f"{key}-offsets"] = buffer
0 commit comments