Skip to content

Commit 1d32cdc

Browse files
committed
Add CUDA streams to cudf-polars
This adds CUDA streams to all pylibcudf calls in cudf-polars. At the moment, we continue to use the default stream for all operations, but we now pass it *explicitly* rather than relying on the implicit default. A future PR will update things to use non-default streams.
1 parent e534472 commit 1d32cdc

33 files changed

+1097
-358
lines changed

python/cudf_polars/cudf_polars/containers/column.py

Lines changed: 61 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from __future__ import annotations
77

8-
import functools
98
from typing import TYPE_CHECKING
109

1110
import polars as pl
@@ -28,6 +27,8 @@
2827
if TYPE_CHECKING:
2928
from typing_extensions import Self
3029

30+
from rmm.pylibrmm.stream import Stream
31+
3132
from cudf_polars.typing import (
3233
ColumnHeader,
3334
ColumnOptions,
@@ -82,6 +83,8 @@ def __init__(
8283
self.name = name
8384
self.dtype = dtype
8485
self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
86+
self._nan_count: int | None = None
87+
self._obj_scalar: plc.Scalar | None = None
8588

8689
@classmethod
8790
def deserialize(
@@ -126,6 +129,7 @@ def deserialize_ctor_kwargs(
126129

127130
def serialize(
128131
self,
132+
stream: Stream,
129133
) -> tuple[ColumnHeader, tuple[memoryview[bytes], plc.gpumemoryview]]:
130134
"""
131135
Serialize the Column into header and frames.
@@ -145,7 +149,7 @@ def serialize(
145149
frames
146150
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
147151
"""
148-
packed = plc.contiguous_split.pack(plc.Table([self.obj]))
152+
packed = plc.contiguous_split.pack(plc.Table([self.obj]), stream=stream)
149153
header: ColumnHeader = {
150154
"column_kwargs": self.serialize_ctor_kwargs(),
151155
"frame_count": 2,
@@ -162,8 +166,7 @@ def serialize_ctor_kwargs(self) -> ColumnOptions:
162166
"dtype": pl.polars.dtype_str_repr(self.dtype.polars_type),
163167
}
164168

165-
@functools.cached_property
166-
def obj_scalar(self) -> plc.Scalar:
169+
def obj_scalar(self, stream: Stream) -> plc.Scalar:
167170
"""
168171
A copy of the column object as a pylibcudf Scalar.
169172
@@ -178,7 +181,9 @@ def obj_scalar(self) -> plc.Scalar:
178181
"""
179182
if not self.is_scalar:
180183
raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
181-
return plc.copying.get_element(self.obj, 0)
184+
if self._obj_scalar is None:
185+
self._obj_scalar = plc.copying.get_element(self.obj, 0, stream=stream)
186+
return self._obj_scalar
182187

183188
def rename(self, name: str | None, /) -> Self:
184189
"""
@@ -228,6 +233,7 @@ def check_sorted(
228233
*,
229234
order: plc.types.Order,
230235
null_order: plc.types.NullOrder,
236+
stream: Stream,
231237
) -> bool:
232238
"""
233239
Check if the column is sorted.
@@ -238,6 +244,9 @@ def check_sorted(
238244
The requested sort order.
239245
null_order
240246
Where nulls sort to.
247+
stream
248+
CUDA stream used for device memory operations and kernel launches
249+
on this dataframe. The data in ``self.obj`` must be valid on this stream.
241250
242251
Returns
243252
-------
@@ -254,21 +263,26 @@ def check_sorted(
254263
return self.order == order and (
255264
self.null_count == 0 or self.null_order == null_order
256265
)
257-
if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
266+
if plc.sorting.is_sorted(
267+
plc.Table([self.obj]), [order], [null_order], stream=stream
268+
):
258269
self.sorted = plc.types.Sorted.YES
259270
self.order = order
260271
self.null_order = null_order
261272
return True
262273
return False
263274

264-
def astype(self, dtype: DataType) -> Column:
275+
def astype(self, dtype: DataType, stream: Stream) -> Column:
265276
"""
266277
Cast the column to the requested dtype.
267278
268279
Parameters
269280
----------
270281
dtype
271282
Datatype to cast to.
283+
stream
284+
CUDA stream used for device memory operations and kernel launches
285+
on this dataframe. The data in ``self.obj`` must be valid on this stream.
272286
273287
Returns
274288
-------
@@ -292,11 +306,15 @@ def astype(self, dtype: DataType) -> Column:
292306
plc_dtype.id() == plc.TypeId.STRING
293307
or self.obj.type().id() == plc.TypeId.STRING
294308
):
295-
return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
309+
return Column(
310+
self._handle_string_cast(plc_dtype, stream=stream), dtype=dtype
311+
)
296312
elif plc.traits.is_integral_not_bool(
297313
self.obj.type()
298314
) and plc.traits.is_timestamp(plc_dtype):
299-
upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64))
315+
upcasted = plc.unary.cast(
316+
self.obj, plc.DataType(plc.TypeId.INT64), stream=stream
317+
)
300318
plc_col = plc.column.Column(
301319
plc_dtype,
302320
upcasted.size(),
@@ -319,40 +337,44 @@ def astype(self, dtype: DataType) -> Column:
319337
self.obj.offset(),
320338
self.obj.children(),
321339
)
322-
return Column(plc.unary.cast(plc_col, plc_dtype), dtype=dtype).sorted_like(
323-
self
324-
)
340+
return Column(
341+
plc.unary.cast(plc_col, plc_dtype, stream=stream), dtype=dtype
342+
).sorted_like(self)
325343
else:
326-
result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
344+
result = Column(
345+
plc.unary.cast(self.obj, plc_dtype, stream=stream), dtype=dtype
346+
)
327347
if is_order_preserving_cast(self.obj.type(), plc_dtype):
328348
return result.sorted_like(self)
329349
return result
330350

331-
def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column:
351+
def _handle_string_cast(self, dtype: plc.DataType, stream: Stream) -> plc.Column:
332352
if dtype.id() == plc.TypeId.STRING:
333353
if is_floating_point(self.obj.type()):
334-
return from_floats(self.obj)
354+
return from_floats(self.obj, stream=stream)
335355
else:
336-
return from_integers(self.obj)
356+
return from_integers(self.obj, stream=stream)
337357
else:
338358
if is_floating_point(dtype):
339-
floats = is_float(self.obj)
359+
floats = is_float(self.obj, stream=stream)
340360
if not plc.reduce.reduce(
341361
floats,
342362
plc.aggregation.all(),
343363
plc.DataType(plc.TypeId.BOOL8),
364+
stream=stream,
344365
).to_py():
345366
raise InvalidOperationError("Conversion from `str` failed.")
346367
return to_floats(self.obj, dtype)
347368
else:
348-
integers = is_integer(self.obj)
369+
integers = is_integer(self.obj, stream=stream)
349370
if not plc.reduce.reduce(
350371
integers,
351372
plc.aggregation.all(),
352373
plc.DataType(plc.TypeId.BOOL8),
374+
stream=stream,
353375
).to_py():
354376
raise InvalidOperationError("Conversion from `str` failed.")
355-
return to_integers(self.obj, dtype)
377+
return to_integers(self.obj, dtype, stream=stream)
356378

357379
def copy_metadata(self, from_: pl.Series, /) -> Self:
358380
"""
@@ -439,28 +461,31 @@ def copy(self) -> Self:
439461
dtype=self.dtype,
440462
)
441463

442-
def mask_nans(self) -> Self:
464+
def mask_nans(self, stream: Stream) -> Self:
443465
"""Return a shallow copy of self with nans masked out."""
444466
if plc.traits.is_floating_point(self.obj.type()):
445467
old_count = self.null_count
446-
mask, new_count = plc.transform.nans_to_nulls(self.obj)
468+
mask, new_count = plc.transform.nans_to_nulls(self.obj, stream=stream)
447469
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
448470
if old_count == new_count:
449471
return result.sorted_like(self)
450472
return result
451473
return self.copy()
452474

453-
@functools.cached_property
454-
def nan_count(self) -> int:
475+
def nan_count(self, stream: Stream) -> int:
455476
"""Return the number of NaN values in the column."""
456-
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
457-
# See https://github.com/rapidsai/cudf/issues/20202 for why we type-ignore
458-
return plc.reduce.reduce(
459-
plc.unary.is_nan(self.obj),
460-
plc.aggregation.sum(),
461-
plc.types.SIZE_TYPE,
462-
).to_py() # type: ignore[return-value]
463-
return 0
477+
if self._nan_count is None:
478+
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
479+
# See https://github.com/rapidsai/cudf/issues/20202 for why we type-ignore
480+
self._nan_count = plc.reduce.reduce( # type: ignore[assignment]
481+
plc.unary.is_nan(self.obj, stream),
482+
plc.aggregation.sum(),
483+
plc.types.SIZE_TYPE,
484+
stream=stream,
485+
).to_py()
486+
else:
487+
self._nan_count = 0
488+
return self._nan_count # type: ignore[return-value]
464489

465490
@property
466491
def size(self) -> int:
@@ -472,7 +497,7 @@ def null_count(self) -> int:
472497
"""Return the number of Null values in the column."""
473498
return self.obj.null_count()
474499

475-
def slice(self, zlice: Slice | None) -> Self:
500+
def slice(self, zlice: Slice | None, stream: Stream) -> Self:
476501
"""
477502
Slice a column.
478503
@@ -481,6 +506,9 @@ def slice(self, zlice: Slice | None) -> Self:
481506
zlice
482507
optional, tuple of start and length, negative values of start
483508
treated as for python indexing. If not provided, returns self.
509+
stream
510+
CUDA stream used for device memory operations and kernel launches
511+
on this dataframe. The data in ``self.obj`` must be valid on this stream.
484512
485513
Returns
486514
-------
@@ -491,6 +519,7 @@ def slice(self, zlice: Slice | None) -> Self:
491519
(table,) = plc.copying.slice(
492520
plc.Table([self.obj]),
493521
conversion.from_polars_slice(zlice, num_rows=self.size),
522+
stream=stream,
494523
)
495524
(column,) = table.columns()
496525
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)

python/cudf_polars/cudf_polars/containers/dataframe.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ def deserialize(
252252

253253
def serialize(
254254
self,
255+
stream: Stream | None = None,
255256
) -> tuple[DataFrameHeader, tuple[memoryview[bytes], plc.gpumemoryview]]:
256257
"""
257258
Serialize the table into header and frames.
@@ -264,14 +265,20 @@ def serialize(
264265
>>> from cudf_polars.experimental.dask_serialize import register
265266
>>> register()
266267
268+
Parameters
269+
----------
270+
stream
271+
CUDA stream used for device memory operations and kernel launches
272+
on this dataframe.
273+
267274
Returns
268275
-------
269276
header
270277
A dict containing any picklable metadata required to reconstruct the object.
271278
frames
272279
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
273280
"""
274-
packed = plc.contiguous_split.pack(self.table, stream=self.stream)
281+
packed = plc.contiguous_split.pack(self.table, stream=stream)
275282

276283
# Keyword arguments for `Column.__init__`.
277284
columns_kwargs: list[ColumnOptions] = [

0 commit comments

Comments
 (0)