From 0f29529da946cb662b082c9e34cfdcc8b9d88b74 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Fri, 18 Jul 2025 16:09:04 +1000 Subject: [PATCH 01/15] update to nanvar to use more stable algorithm if engine is flox --- flox/__init__.py | 2 - flox/_version.py | 1 + flox/aggregate_flox.py | 116 ++++++++++++++++++++++++++++++++++++++++- flox/aggregations.py | 71 +++++++++++++++++++++---- 4 files changed, 178 insertions(+), 12 deletions(-) create mode 100644 flox/_version.py diff --git a/flox/__init__.py b/flox/__init__.py index 898c10e24..2ed80afc5 100644 --- a/flox/__init__.py +++ b/flox/__init__.py @@ -13,7 +13,6 @@ ReindexArrayType, ) # noqa - def _get_version(): __version__ = "999" try: @@ -22,5 +21,4 @@ def _get_version(): pass return __version__ - __version__ = _get_version() diff --git a/flox/_version.py b/flox/_version.py new file mode 100644 index 000000000..5b33c8a46 --- /dev/null +++ b/flox/_version.py @@ -0,0 +1 @@ +__version__ = "0.1.dev657+g619a390.d20250606" \ No newline at end of file diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index e470151a4..ea1aa2c1e 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -5,6 +5,86 @@ from . import xrdtypes as dtypes from .xrutils import is_scalar, isnull, notnull +MULTIARRAY_HANDLED_FUNCTIONS = {} + +class MultiArray: + arrays: tuple[np.ndarray, ...] + + def __init__(self, arrays): + self.arrays = arrays # something else needed here to be more careful about types (not sure what) + # Do we want to co-erce arrays into a tuple and make sure it's immutable? Do we want it to be immutable? + assert np.all([arrays[0].shape == a.shape for a in arrays]), 'Expect all arrays to have the same shape' + + def astype(self,dt,**kwargs): + new_arrays = [] # I really don't like doing this as a list + for array in self.arrays: # Do we care about trying to avoid for loops here? three separate lines would be faster, but harder to read + new_arrays.append(array.astype(dt,**kwargs)) + return MultiArray(new_arrays) + + def reshape(self,shape,**kwargs): + return MultiArray([array.reshape(shape,**kwargs) for array in self.arrays]) + + def squeeze(self,axis=None): + return MultiArray([array.squeeze(axis) for array in self.arrays]) + + def __array_function__(self, func, types, args, kwargs): + if func not in MULTIARRAY_HANDLED_FUNCTIONS: + return NotImplemented + # Note: this allows subclasses that don't override + # __array_function__ to handle MyArray objects + #if not all(issubclass(t, MyArray) for t in types): # I can't see this being relevant at all for this code, but maybe it's safer to leave it in? + #return NotImplemented + return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) + + + # Shape is needed, seems likely that the other two might be + # Making some strong assumptions here that all the arrays are the same shape, and I don't really like this + @property + def dtype(self) -> np.dtype: + return self.arrays[0].dtype + + @property + def shape(self) -> tuple[int, ...]: + return self.arrays[0].shape + + @property + def ndim(self) -> int: + return self.arrays[0].ndim + +def implements(numpy_function): + """Register an __array_function__ implementation for MyArray objects.""" + def decorator(func): + MULTIARRAY_HANDLED_FUNCTIONS[numpy_function] = func + return func + return decorator + +@implements(np.expand_dims) +def expand_dims_MultiArray(multiarray,axis): + return MultiArray([np.expand_dims(a,axis) for a in multiarray.arrays]) #This is gonna spit out a list and I'm not sure if I'm okay with that? + +@implements(np.concatenate) +def concatenate_MultiArray(multiarrays,axis): + + n_arrays = len(multiarrays[0].arrays) + for ma in multiarrays[1:]: + if not (len(ma.arrays) == n_arrays): # I don't know what trying to concatenate MultiArrays with different numbers of arrays would even mean + raise NotImplementedError + + # There's the potential for problematic different shapes coming in here. + # Probably warrants some defensive programming, but I'm not sure what to check for while still being generic + + # I don't like using append and lists here, but I can't work out how to do it better + new_arrays = [] + for i in range(multiarrays[0].ndim): + new_arrays.append(np.concatenate([ma.arrays[i] for ma in multiarrays],axis)) + + out = MultiArray(new_arrays) + return out + +@implements(np.transpose) +def transpose_MultiArray(multiarray,axes): + return MultiArray([np.transpose(a,axes) for a in multiarray.arrays]) #This is gonna spit out a list and I'm not sure if I'm okay with that? + def _prepare_for_flox(group_idx, array): """ @@ -206,7 +286,6 @@ def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs): nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=True)) # TODO: all, any - def sum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): return sum( group_idx, @@ -250,6 +329,41 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out +def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): + # Calculate length and sum - important for the adjustment terms to sum squared deviations + array_lens = nanlen( + group_idx, + array, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + array_sums = sum( + group_idx, + array, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + # Calculate sum squared deviations - the main part of variance sum + array_means = array_sums/array_lens # Does this risk being run eagerly because it's not wrapped in anything? + + sum_squared_deviations = sum( + group_idx, + (array-array_means[..., group_idx])**2, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + return MultiArray((sum_squared_deviations,array_sums,array_lens)) + + def ffill(group_idx, array, *, axis, **kwargs): group_idx, array, perm = _prepare_for_flox(group_idx, array) diff --git a/flox/aggregations.py b/flox/aggregations.py index 6182d6bdd..56aec6a5d 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -342,14 +342,57 @@ def _mean_finalize(sum_, count): final_dtype=np.floating, ) - +def _var_combine(array, axis, keepdims=True): + + def clip_last(array): + '''Return array except the last element along axis + Purely included to tidy up the adj_terms line + ''' + not_last = [slice(None,None) for i in range(array.ndim)] + not_last[axis[0]] = slice(None,-1) + return array[*not_last] + def clip_first(array): + '''Return array except the first element along axis + Purely included to tidy up the adj_terms line + ''' + not_first = [slice(None,None) for i in range(array.ndim)] + not_first[axis[0]] = slice(1,None) + return array[*not_first] + + assert len(axis)==1, "Assuming that the combine function is only in one direction at once" + + # Does this double our memory footprint or are they just views? + # If there's a huge memory impact, probably better to copy paste array.arrays[1] + # in and accept the hit to readability + sum_deviations = array.arrays[0] + sum_X = array.arrays[1] + sum_len = array.arrays[2] + + # Calculate parts needed for cascading combination + cumsum_X = np.cumsum(sum_X,axis=axis[0]) #Don't need to be able to merge the last element + cumsum_len = np.cumsum(sum_len,axis=axis[0]) + + + # Adjustment terms to tweak the sum of squared deviations because not every chunk has the same mean + adj_terms = ((clip_last(cumsum_len)*clip_first(sum_X)-clip_first(sum_len)*clip_last(cumsum_X))**2/ + (clip_last(cumsum_len)*clip_first(sum_len)*(clip_last(cumsum_len)+clip_first(sum_len)))) + + + + return aggregate_flox.MultiArray((np.sum(sum_deviations,axis=axis,keepdims=keepdims)+np.sum(adj_terms,axis=axis,keepdims=keepdims), # sum of squared deviations + np.sum(sum_X,axis=axis,keepdims=keepdims), # sum of array items + np.sum(sum_len,axis=axis,keepdims=keepdims), # sum of array lengths + ))# I'm not even pretending calling this class from there is a good idea, I think it wants to be somewhere else though + # TODO: fix this for complex numbers -def _var_finalize(sumsq, sum_, count, ddof=0): - with np.errstate(invalid="ignore", divide="ignore"): - result = (sumsq - (sum_**2 / count)) / (count - ddof) - result[count <= ddof] = np.nan - return result +#def _var_finalize(sumsq, sum_, count, ddof=0): + #with np.errstate(invalid="ignore", divide="ignore"): + #result = (sumsq - (sum_**2 / count)) / (count - ddof) + #result[count <= ddof] = np.nan + #return result +def _var_finalize(multiarray,ddof=0): + return multiarray.arrays[0]/(multiarray.arrays[2]-ddof) # Is this how ddof works again??? def _std_finalize(sumsq, sum_, count, ddof=0): return np.sqrt(_var_finalize(sumsq, sum_, count, ddof)) @@ -366,14 +409,24 @@ def _std_finalize(sumsq, sum_, count, ddof=0): dtypes=(None, None, np.intp), final_dtype=np.floating, ) +#nanvar = Aggregation( + #"nanvar", + #chunk=("nansum_of_squares", "nansum", "nanlen"), + #combine=("sum", "sum", "sum"), + #finalize=_var_finalize, + #fill_value=0, + #final_fill_value=np.nan, + #dtypes=(None, None, np.intp), + #final_dtype=np.floating, +#) nanvar = Aggregation( "nanvar", - chunk=("nansum_of_squares", "nansum", "nanlen"), - combine=("sum", "sum", "sum"), + chunk=("var_chunk"), + combine=(_var_combine,), finalize=_var_finalize, fill_value=0, final_fill_value=np.nan, - dtypes=(None, None, np.intp), + dtypes=(None, ), final_dtype=np.floating, ) std = Aggregation( From 1fbf5f85a192c2cfd322f7095ed724b6b6112689 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Jul 2025 06:26:35 +0000 Subject: [PATCH 02/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flox/__init__.py | 2 + flox/_version.py | 2 +- flox/aggregate_flox.py | 96 ++++++++++++++++++++++++------------------ flox/aggregations.py | 94 ++++++++++++++++++++++------------------- 4 files changed, 109 insertions(+), 85 deletions(-) diff --git a/flox/__init__.py b/flox/__init__.py index 2ed80afc5..898c10e24 100644 --- a/flox/__init__.py +++ b/flox/__init__.py @@ -13,6 +13,7 @@ ReindexArrayType, ) # noqa + def _get_version(): __version__ = "999" try: @@ -21,4 +22,5 @@ def _get_version(): pass return __version__ + __version__ = _get_version() diff --git a/flox/_version.py b/flox/_version.py index 5b33c8a46..a76cf84b5 100644 --- a/flox/_version.py +++ b/flox/_version.py @@ -1 +1 @@ -__version__ = "0.1.dev657+g619a390.d20250606" \ No newline at end of file +__version__ = "0.1.dev657+g619a390.d20250606" diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index ea1aa2c1e..d5f0084c2 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -7,38 +7,40 @@ MULTIARRAY_HANDLED_FUNCTIONS = {} + class MultiArray: arrays: tuple[np.ndarray, ...] def __init__(self, arrays): - self.arrays = arrays # something else needed here to be more careful about types (not sure what) + self.arrays = arrays # something else needed here to be more careful about types (not sure what) # Do we want to co-erce arrays into a tuple and make sure it's immutable? Do we want it to be immutable? - assert np.all([arrays[0].shape == a.shape for a in arrays]), 'Expect all arrays to have the same shape' - - def astype(self,dt,**kwargs): - new_arrays = [] # I really don't like doing this as a list - for array in self.arrays: # Do we care about trying to avoid for loops here? three separate lines would be faster, but harder to read - new_arrays.append(array.astype(dt,**kwargs)) + assert np.all([arrays[0].shape == a.shape for a in arrays]), ( + "Expect all arrays to have the same shape" + ) + + def astype(self, dt, **kwargs): + new_arrays = [] # I really don't like doing this as a list + for array in self.arrays: # Do we care about trying to avoid for loops here? three separate lines would be faster, but harder to read + new_arrays.append(array.astype(dt, **kwargs)) return MultiArray(new_arrays) - - def reshape(self,shape,**kwargs): - return MultiArray([array.reshape(shape,**kwargs) for array in self.arrays]) - - def squeeze(self,axis=None): - return MultiArray([array.squeeze(axis) for array in self.arrays]) - + + def reshape(self, shape, **kwargs): + return MultiArray([array.reshape(shape, **kwargs) for array in self.arrays]) + + def squeeze(self, axis=None): + return MultiArray([array.squeeze(axis) for array in self.arrays]) + def __array_function__(self, func, types, args, kwargs): if func not in MULTIARRAY_HANDLED_FUNCTIONS: return NotImplemented # Note: this allows subclasses that don't override # __array_function__ to handle MyArray objects - #if not all(issubclass(t, MyArray) for t in types): # I can't see this being relevant at all for this code, but maybe it's safer to leave it in? - #return NotImplemented - return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) + # if not all(issubclass(t, MyArray) for t in types): # I can't see this being relevant at all for this code, but maybe it's safer to leave it in? + # return NotImplemented + return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) - - # Shape is needed, seems likely that the other two might be - # Making some strong assumptions here that all the arrays are the same shape, and I don't really like this + # Shape is needed, seems likely that the other two might be + # Making some strong assumptions here that all the arrays are the same shape, and I don't really like this @property def dtype(self) -> np.dtype: return self.arrays[0].dtype @@ -51,39 +53,50 @@ def shape(self) -> tuple[int, ...]: def ndim(self) -> int: return self.arrays[0].ndim + def implements(numpy_function): """Register an __array_function__ implementation for MyArray objects.""" + def decorator(func): MULTIARRAY_HANDLED_FUNCTIONS[numpy_function] = func return func + return decorator + @implements(np.expand_dims) -def expand_dims_MultiArray(multiarray,axis): - return MultiArray([np.expand_dims(a,axis) for a in multiarray.arrays]) #This is gonna spit out a list and I'm not sure if I'm okay with that? +def expand_dims_MultiArray(multiarray, axis): + return MultiArray( + [np.expand_dims(a, axis) for a in multiarray.arrays] + ) # This is gonna spit out a list and I'm not sure if I'm okay with that? + @implements(np.concatenate) -def concatenate_MultiArray(multiarrays,axis): - +def concatenate_MultiArray(multiarrays, axis): n_arrays = len(multiarrays[0].arrays) for ma in multiarrays[1:]: - if not (len(ma.arrays) == n_arrays): # I don't know what trying to concatenate MultiArrays with different numbers of arrays would even mean + if not ( + len(ma.arrays) == n_arrays + ): # I don't know what trying to concatenate MultiArrays with different numbers of arrays would even mean raise NotImplementedError - - # There's the potential for problematic different shapes coming in here. + + # There's the potential for problematic different shapes coming in here. # Probably warrants some defensive programming, but I'm not sure what to check for while still being generic - + # I don't like using append and lists here, but I can't work out how to do it better new_arrays = [] for i in range(multiarrays[0].ndim): - new_arrays.append(np.concatenate([ma.arrays[i] for ma in multiarrays],axis)) - + new_arrays.append(np.concatenate([ma.arrays[i] for ma in multiarrays], axis)) + out = MultiArray(new_arrays) return out + @implements(np.transpose) -def transpose_MultiArray(multiarray,axes): - return MultiArray([np.transpose(a,axes) for a in multiarray.arrays]) #This is gonna spit out a list and I'm not sure if I'm okay with that? +def transpose_MultiArray(multiarray, axes): + return MultiArray( + [np.transpose(a, axes) for a in multiarray.arrays] + ) # This is gonna spit out a list and I'm not sure if I'm okay with that? def _prepare_for_flox(group_idx, array): @@ -286,6 +299,7 @@ def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs): nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=True)) # TODO: all, any + def sum_of_squares(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): return sum( group_idx, @@ -329,6 +343,7 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out + def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): # Calculate length and sum - important for the adjustment terms to sum squared deviations array_lens = nanlen( @@ -339,7 +354,7 @@ def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=No fill_value=fill_value, dtype=dtype, ) - + array_sums = sum( group_idx, array, @@ -348,21 +363,22 @@ def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=No fill_value=fill_value, dtype=dtype, ) - + # Calculate sum squared deviations - the main part of variance sum - array_means = array_sums/array_lens # Does this risk being run eagerly because it's not wrapped in anything? - + array_means = ( + array_sums / array_lens + ) # Does this risk being run eagerly because it's not wrapped in anything? + sum_squared_deviations = sum( group_idx, - (array-array_means[..., group_idx])**2, + (array - array_means[..., group_idx]) ** 2, axis=axis, size=size, fill_value=fill_value, dtype=dtype, - ) - - return MultiArray((sum_squared_deviations,array_sums,array_lens)) + ) + return MultiArray((sum_squared_deviations, array_sums, array_lens)) def ffill(group_idx, array, *, axis, **kwargs): diff --git a/flox/aggregations.py b/flox/aggregations.py index 56aec6a5d..f95428d6d 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -342,57 +342,63 @@ def _mean_finalize(sum_, count): final_dtype=np.floating, ) + def _var_combine(array, axis, keepdims=True): - def clip_last(array): - '''Return array except the last element along axis + """Return array except the last element along axis Purely included to tidy up the adj_terms line - ''' - not_last = [slice(None,None) for i in range(array.ndim)] - not_last[axis[0]] = slice(None,-1) + """ + not_last = [slice(None, None) for i in range(array.ndim)] + not_last[axis[0]] = slice(None, -1) return array[*not_last] + def clip_first(array): - '''Return array except the first element along axis + """Return array except the first element along axis Purely included to tidy up the adj_terms line - ''' - not_first = [slice(None,None) for i in range(array.ndim)] - not_first[axis[0]] = slice(1,None) - return array[*not_first] - - assert len(axis)==1, "Assuming that the combine function is only in one direction at once" - + """ + not_first = [slice(None, None) for i in range(array.ndim)] + not_first[axis[0]] = slice(1, None) + return array[*not_first] + + assert len(axis) == 1, "Assuming that the combine function is only in one direction at once" + # Does this double our memory footprint or are they just views? # If there's a huge memory impact, probably better to copy paste array.arrays[1] # in and accept the hit to readability sum_deviations = array.arrays[0] sum_X = array.arrays[1] - sum_len = array.arrays[2] - - # Calculate parts needed for cascading combination - cumsum_X = np.cumsum(sum_X,axis=axis[0]) #Don't need to be able to merge the last element - cumsum_len = np.cumsum(sum_len,axis=axis[0]) + sum_len = array.arrays[2] + # Calculate parts needed for cascading combination + cumsum_X = np.cumsum(sum_X, axis=axis[0]) # Don't need to be able to merge the last element + cumsum_len = np.cumsum(sum_len, axis=axis[0]) # Adjustment terms to tweak the sum of squared deviations because not every chunk has the same mean - adj_terms = ((clip_last(cumsum_len)*clip_first(sum_X)-clip_first(sum_len)*clip_last(cumsum_X))**2/ - (clip_last(cumsum_len)*clip_first(sum_len)*(clip_last(cumsum_len)+clip_first(sum_len)))) - + adj_terms = ( + clip_last(cumsum_len) * clip_first(sum_X) - clip_first(sum_len) * clip_last(cumsum_X) + ) ** 2 / (clip_last(cumsum_len) * clip_first(sum_len) * (clip_last(cumsum_len) + clip_first(sum_len))) + + return aggregate_flox.MultiArray( + ( + np.sum(sum_deviations, axis=axis, keepdims=keepdims) + + np.sum(adj_terms, axis=axis, keepdims=keepdims), # sum of squared deviations + np.sum(sum_X, axis=axis, keepdims=keepdims), # sum of array items + np.sum(sum_len, axis=axis, keepdims=keepdims), # sum of array lengths + ) + ) # I'm not even pretending calling this class from there is a good idea, I think it wants to be somewhere else though - return aggregate_flox.MultiArray((np.sum(sum_deviations,axis=axis,keepdims=keepdims)+np.sum(adj_terms,axis=axis,keepdims=keepdims), # sum of squared deviations - np.sum(sum_X,axis=axis,keepdims=keepdims), # sum of array items - np.sum(sum_len,axis=axis,keepdims=keepdims), # sum of array lengths - ))# I'm not even pretending calling this class from there is a good idea, I think it wants to be somewhere else though - # TODO: fix this for complex numbers -#def _var_finalize(sumsq, sum_, count, ddof=0): - #with np.errstate(invalid="ignore", divide="ignore"): - #result = (sumsq - (sum_**2 / count)) / (count - ddof) - #result[count <= ddof] = np.nan - #return result +# def _var_finalize(sumsq, sum_, count, ddof=0): +# with np.errstate(invalid="ignore", divide="ignore"): +# result = (sumsq - (sum_**2 / count)) / (count - ddof) +# result[count <= ddof] = np.nan +# return result + + +def _var_finalize(multiarray, ddof=0): + return multiarray.arrays[0] / (multiarray.arrays[2] - ddof) # Is this how ddof works again??? -def _var_finalize(multiarray,ddof=0): - return multiarray.arrays[0]/(multiarray.arrays[2]-ddof) # Is this how ddof works again??? def _std_finalize(sumsq, sum_, count, ddof=0): return np.sqrt(_var_finalize(sumsq, sum_, count, ddof)) @@ -409,16 +415,16 @@ def _std_finalize(sumsq, sum_, count, ddof=0): dtypes=(None, None, np.intp), final_dtype=np.floating, ) -#nanvar = Aggregation( - #"nanvar", - #chunk=("nansum_of_squares", "nansum", "nanlen"), - #combine=("sum", "sum", "sum"), - #finalize=_var_finalize, - #fill_value=0, - #final_fill_value=np.nan, - #dtypes=(None, None, np.intp), - #final_dtype=np.floating, -#) +# nanvar = Aggregation( +# "nanvar", +# chunk=("nansum_of_squares", "nansum", "nanlen"), +# combine=("sum", "sum", "sum"), +# finalize=_var_finalize, +# fill_value=0, +# final_fill_value=np.nan, +# dtypes=(None, None, np.intp), +# final_dtype=np.floating, +# ) nanvar = Aggregation( "nanvar", chunk=("var_chunk"), @@ -426,7 +432,7 @@ def _std_finalize(sumsq, sum_, count, ddof=0): finalize=_var_finalize, fill_value=0, final_fill_value=np.nan, - dtypes=(None, ), + dtypes=(None,), final_dtype=np.floating, ) std = Aggregation( From 322f5115509c2127316cc051fdf55c1b804b96cc Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 18 Jul 2025 09:47:42 -0600 Subject: [PATCH 03/15] [revert] only nanvar test --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 88169c0f5..7545480bd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -235,7 +235,7 @@ def gen_array_by(size, func): @pytest.mark.parametrize("size", [(1, 12), (12,), (12, 9)]) @pytest.mark.parametrize("nby", [1, 2, 3]) @pytest.mark.parametrize("add_nan_by", [True, False]) -@pytest.mark.parametrize("func", ALL_FUNCS) +@pytest.mark.parametrize("func", ["nanvar"]) def test_groupby_reduce_all(to_sparse, nby, size, chunks, func, add_nan_by, engine): if ("arg" in func and engine in ["flox", "numbagg"]) or (func in BLOCKWISE_FUNCS and chunks != -1): pytest.skip() From adab8e68b2823d8cba82ff279332fc1e4bf5e858 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 18 Jul 2025 09:49:10 -0600 Subject: [PATCH 04/15] Some mods --- flox/aggregate_flox.py | 41 ++++-------------------------------- flox/aggregations.py | 48 +++++++++++++++++++++++++++++++++++++++++- flox/core.py | 8 ++++++- 3 files changed, 58 insertions(+), 39 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index d5f0084c2..41309fcf0 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -1,4 +1,5 @@ from functools import partial +from typing import Self import numpy as np @@ -53,6 +54,9 @@ def shape(self) -> tuple[int, ...]: def ndim(self) -> int: return self.arrays[0].ndim + def __getitem__(self, key) -> Self: + return type(self)([array[key] for array in self.arrays]) + def implements(numpy_function): """Register an __array_function__ implementation for MyArray objects.""" @@ -344,43 +348,6 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None return out -def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): - # Calculate length and sum - important for the adjustment terms to sum squared deviations - array_lens = nanlen( - group_idx, - array, - axis=axis, - size=size, - fill_value=fill_value, - dtype=dtype, - ) - - array_sums = sum( - group_idx, - array, - axis=axis, - size=size, - fill_value=fill_value, - dtype=dtype, - ) - - # Calculate sum squared deviations - the main part of variance sum - array_means = ( - array_sums / array_lens - ) # Does this risk being run eagerly because it's not wrapped in anything? - - sum_squared_deviations = sum( - group_idx, - (array - array_means[..., group_idx]) ** 2, - axis=axis, - size=size, - fill_value=fill_value, - dtype=dtype, - ) - - return MultiArray((sum_squared_deviations, array_sums, array_lens)) - - def ffill(group_idx, array, *, axis, **kwargs): group_idx, array, perm = _prepare_for_flox(group_idx, array) shape = array.shape diff --git a/flox/aggregations.py b/flox/aggregations.py index f95428d6d..0961d13a7 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -343,6 +343,51 @@ def _mean_finalize(sum_, count): ) +def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=None, dtype=None): + from .aggregate_flox import MultiArray + + # Calculate length and sum - important for the adjustment terms to sum squared deviations + array_lens = generic_aggregate( + group_idx, + array, + func="nanlen", + engine=engine, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + array_sums = generic_aggregate( + group_idx, + array, + func="nansum", + engine=engine, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + # Calculate sum squared deviations - the main part of variance sum + array_means = ( + array_sums / array_lens + ) # Does this risk being run eagerly because it's not wrapped in anything? + + sum_squared_deviations = generic_aggregate( + group_idx, + (array - array_means[..., group_idx]) ** 2, + func="nansum", + engine=engine, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + return MultiArray((sum_squared_deviations, array_sums, array_lens)) + + def _var_combine(array, axis, keepdims=True): def clip_last(array): """Return array except the last element along axis @@ -427,7 +472,8 @@ def _std_finalize(sumsq, sum_, count, ddof=0): # ) nanvar = Aggregation( "nanvar", - chunk=("var_chunk"), + chunk=var_chunk, + numpy="nanvar", combine=(_var_combine,), finalize=_var_finalize, fill_value=0, diff --git a/flox/core.py b/flox/core.py index d8600b375..f8d724a16 100644 --- a/flox/core.py +++ b/flox/core.py @@ -46,6 +46,7 @@ _initialize_aggregation, generic_aggregate, quantile_new_dims_func, + var_chunk, ) from .cache import memoize from .lib import ArrayLayer, dask_array_type, sparse_array_type @@ -1251,7 +1252,8 @@ def chunk_reduce( # optimize that out. previous_reduction: T_Func = "" for reduction, fv, kw, dt in zip(funcs, fill_values, kwargss, dtypes): - if empty: + # UGLY! but this is because the `var` breaks our design assumptions + if empty and reduction is not var_chunk: result = np.full(shape=final_array_shape, fill_value=fv, like=array) elif is_nanlen(reduction) and is_nanlen(previous_reduction): result = results["intermediates"][-1] @@ -1260,6 +1262,10 @@ def chunk_reduce( kw_func = dict(size=size, dtype=dt, fill_value=fv) kw_func.update(kw) + # UGLY! but this is because the `var` breaks our design assumptions + if reduction is var_chunk: + kw_func.update(engine=engine) + if callable(reduction): # passing a custom reduction for npg to apply per-group is really slow! # So this `reduction` has to do the groupby-aggregation From 93cd9b309277a9512f6d63e251e27c5b5ef00dd3 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:11:31 +1000 Subject: [PATCH 05/15] Update flox/aggregations.py to neater tuple unpacking Co-authored-by: Deepak Cherian --- flox/aggregations.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 0961d13a7..f989571f5 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -410,9 +410,7 @@ def clip_first(array): # Does this double our memory footprint or are they just views? # If there's a huge memory impact, probably better to copy paste array.arrays[1] # in and accept the hit to readability - sum_deviations = array.arrays[0] - sum_X = array.arrays[1] - sum_len = array.arrays[2] + sum_deviations, sum_X, sum_len = array.arrays # Calculate parts needed for cascading combination cumsum_X = np.cumsum(sum_X, axis=axis[0]) # Don't need to be able to merge the last element From 2be4f744d8eb97f7ba4ac27fd2ec4ba5bcacf0bc Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:19:10 +1000 Subject: [PATCH 06/15] Change np.all to all in flox/aggregate_flox.py Co-authored-by: Deepak Cherian --- flox/aggregate_flox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index 41309fcf0..ce128d283 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -15,7 +15,7 @@ class MultiArray: def __init__(self, arrays): self.arrays = arrays # something else needed here to be more careful about types (not sure what) # Do we want to co-erce arrays into a tuple and make sure it's immutable? Do we want it to be immutable? - assert np.all([arrays[0].shape == a.shape for a in arrays]), ( + assert all(arrays[0].shape == a.shape for a in arrays), ( "Expect all arrays to have the same shape" ) From edb655dd1f9bd661e9b16b1146cb1fd380cd7380 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 02:19:17 +0000 Subject: [PATCH 07/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flox/aggregate_flox.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index ce128d283..21cbd1119 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -15,9 +15,7 @@ class MultiArray: def __init__(self, arrays): self.arrays = arrays # something else needed here to be more careful about types (not sure what) # Do we want to co-erce arrays into a tuple and make sure it's immutable? Do we want it to be immutable? - assert all(arrays[0].shape == a.shape for a in arrays), ( - "Expect all arrays to have the same shape" - ) + assert all(arrays[0].shape == a.shape for a in arrays), "Expect all arrays to have the same shape" def astype(self, dt, **kwargs): new_arrays = [] # I really don't like doing this as a list From dd2e4b6a30784fca103d593c4b96c6056d72f6d9 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:24:19 +1000 Subject: [PATCH 08/15] delete some resolved comments --- flox/aggregate_flox.py | 2 +- flox/aggregations.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index ea1aa2c1e..fa5a9501d 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -350,7 +350,7 @@ def var_chunk(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=No ) # Calculate sum squared deviations - the main part of variance sum - array_means = array_sums/array_lens # Does this risk being run eagerly because it's not wrapped in anything? + array_means = array_sums/array_lens sum_squared_deviations = sum( group_idx, diff --git a/flox/aggregations.py b/flox/aggregations.py index 56aec6a5d..10cb8add9 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -361,9 +361,6 @@ def clip_first(array): assert len(axis)==1, "Assuming that the combine function is only in one direction at once" - # Does this double our memory footprint or are they just views? - # If there's a huge memory impact, probably better to copy paste array.arrays[1] - # in and accept the hit to readability sum_deviations = array.arrays[0] sum_X = array.arrays[1] sum_len = array.arrays[2] From 19688709b3fa2ec76b33ed73744a91f14ee1b72c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 02:39:21 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flox/aggregate_flox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index b818bd753..21cbd1119 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -345,6 +345,7 @@ def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out + def ffill(group_idx, array, *, axis, **kwargs): group_idx, array, perm = _prepare_for_flox(group_idx, array) shape = array.shape From 12bcb0f56296a2dac7835ed269d7dca68d36e081 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Mon, 21 Jul 2025 14:40:52 +1000 Subject: [PATCH 10/15] Remove more unnecessary comments --- flox/aggregations.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index a96747579..e9bfb1667 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -372,7 +372,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N # Calculate sum squared deviations - the main part of variance sum array_means = ( array_sums / array_lens - ) # Does this risk being run eagerly because it's not wrapped in anything? + ) sum_squared_deviations = generic_aggregate( group_idx, @@ -407,9 +407,6 @@ def clip_first(array): assert len(axis) == 1, "Assuming that the combine function is only in one direction at once" - # Does this double our memory footprint or are they just views? - # If there's a huge memory impact, probably better to copy paste array.arrays[1] - # in and accept the hit to readability sum_deviations, sum_X, sum_len = array.arrays # Calculate parts needed for cascading combination @@ -440,7 +437,7 @@ def clip_first(array): def _var_finalize(multiarray, ddof=0): - return multiarray.arrays[0] / (multiarray.arrays[2] - ddof) # Is this how ddof works again??? + return multiarray.arrays[0] / (multiarray.arrays[2] - ddof) def _std_finalize(sumsq, sum_, count, ddof=0): From b1f7b5d71b934669e44dd78ffa9120b7abaf87b3 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Mon, 21 Jul 2025 14:45:29 +1000 Subject: [PATCH 11/15] Remove _version.py --- flox/_version.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 flox/_version.py diff --git a/flox/_version.py b/flox/_version.py deleted file mode 100644 index a76cf84b5..000000000 --- a/flox/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.1.dev657+g619a390.d20250606" From cd9a8b83db27a1d9ae894a4ef37e98341115cb6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Jul 2025 04:46:38 +0000 Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flox/aggregations.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flox/aggregations.py b/flox/aggregations.py index 1f4c59abf..975c3258e 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -370,9 +370,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N ) # Calculate sum squared deviations - the main part of variance sum - array_means = ( - array_sums / array_lens - ) + array_means = array_sums / array_lens sum_squared_deviations = generic_aggregate( group_idx, From 27448e451c89aad1f307ae39f825060508822a38 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:42:11 +1000 Subject: [PATCH 13/15] Add preliminary test for std/var precision --- tests/test_core.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 863e552dc..3926502e5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2240,3 +2240,25 @@ def test_sparse_nan_fill_value_reductions(chunks, fill_value, shape, func): expected = np.expand_dims(npfunc(numpy_array, axis=-1), axis=-1) actual, *_ = groupby_reduce(array, by, func=func, axis=-1) assert_equal(actual, expected) + + +@pytest.mark.parametrize("func", ("nanvar","var")) # Expect to expand this to other functions once written. Putting var in to begin with bc I know it will fail +@pytest.mark.parametrize("engine",("flox",)) # Expect to expand this to other engines once written +@pytest.mark.parametrize("offset",(0,10e2,10e4,10e6,10e8,10e10,10e12)) # Should fail at 10e8 for old algorithm, and survive 10e12 for current +def test_std_var_precision(func,engine,offset): + # Generate a dataset with small variance and big mean + # Check that func with engine gives you the same answer as numpy + + l =1000 + array = np.linspace(-1,1,l) # has zero mean + labels = np.arange(l)%2 # Ideally we'd parametrize this too. + + # These two need to be the same function, but with the offset added and not added + no_offset, _ = groupby_reduce(array, labels, engine=engine, func=func) + with_offset, _ = groupby_reduce(array+offset, labels, engine=engine, func=func) + + tol = {"rtol": 1e-8, "atol": 1e-10} # Not sure how stringent to be here + + # Failure threshold in my external tests is dependent on dask chunksize, maybe needs exploring better? + + assert_equal(no_offset, with_offset, tol) \ No newline at end of file From a81b1a3847f54ac8f4dca25aee35c86788c18766 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Jul 2025 07:47:15 +0000 Subject: [PATCH 14/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_core.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 3926502e5..4f1e28560 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2242,23 +2242,27 @@ def test_sparse_nan_fill_value_reductions(chunks, fill_value, shape, func): assert_equal(actual, expected) -@pytest.mark.parametrize("func", ("nanvar","var")) # Expect to expand this to other functions once written. Putting var in to begin with bc I know it will fail -@pytest.mark.parametrize("engine",("flox",)) # Expect to expand this to other engines once written -@pytest.mark.parametrize("offset",(0,10e2,10e4,10e6,10e8,10e10,10e12)) # Should fail at 10e8 for old algorithm, and survive 10e12 for current -def test_std_var_precision(func,engine,offset): +@pytest.mark.parametrize( + "func", ("nanvar", "var") +) # Expect to expand this to other functions once written. Putting var in to begin with bc I know it will fail +@pytest.mark.parametrize("engine", ("flox",)) # Expect to expand this to other engines once written +@pytest.mark.parametrize( + "offset", (0, 10e2, 10e4, 10e6, 10e8, 10e10, 10e12) +) # Should fail at 10e8 for old algorithm, and survive 10e12 for current +def test_std_var_precision(func, engine, offset): # Generate a dataset with small variance and big mean # Check that func with engine gives you the same answer as numpy - - l =1000 - array = np.linspace(-1,1,l) # has zero mean - labels = np.arange(l)%2 # Ideally we'd parametrize this too. - + + l = 1000 + array = np.linspace(-1, 1, l) # has zero mean + labels = np.arange(l) % 2 # Ideally we'd parametrize this too. + # These two need to be the same function, but with the offset added and not added no_offset, _ = groupby_reduce(array, labels, engine=engine, func=func) - with_offset, _ = groupby_reduce(array+offset, labels, engine=engine, func=func) - - tol = {"rtol": 1e-8, "atol": 1e-10} # Not sure how stringent to be here - + with_offset, _ = groupby_reduce(array + offset, labels, engine=engine, func=func) + + tol = {"rtol": 1e-8, "atol": 1e-10} # Not sure how stringent to be here + # Failure threshold in my external tests is dependent on dask chunksize, maybe needs exploring better? - - assert_equal(no_offset, with_offset, tol) \ No newline at end of file + + assert_equal(no_offset, with_offset, tol) From 004fddcc88e169a369e4505e7e9cdcac3f80dd39 Mon Sep 17 00:00:00 2001 From: jemmajeffree <98864717+jemmajeffree@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:56:31 +1000 Subject: [PATCH 15/15] Correct comment --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 3926502e5..623c6ae28 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2242,7 +2242,7 @@ def test_sparse_nan_fill_value_reductions(chunks, fill_value, shape, func): assert_equal(actual, expected) -@pytest.mark.parametrize("func", ("nanvar","var")) # Expect to expand this to other functions once written. Putting var in to begin with bc I know it will fail +@pytest.mark.parametrize("func", ("nanvar","var")) # Expect to expand this to other functions once written. "nanvar" has updated chunk, combine functions. "var", for the moment, still uses the old algorithm @pytest.mark.parametrize("engine",("flox",)) # Expect to expand this to other engines once written @pytest.mark.parametrize("offset",(0,10e2,10e4,10e6,10e8,10e10,10e12)) # Should fail at 10e8 for old algorithm, and survive 10e12 for current def test_std_var_precision(func,engine,offset):