From bbb4096996be0129aa6493a57687b8bafaf97a47 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 05:54:18 +0000 Subject: [PATCH 1/7] Fix: Handle empty index in unpivot operation --- bigframes/core/blocks.py | 21 ++++++++++ tests/unit/core/test_blocks_unpivot.py | 54 ++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests/unit/core/test_blocks_unpivot.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0f98f582c2..6f409c4f59 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3471,6 +3471,27 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. """ + if index.empty: + id_gen = bigframes.core.identifiers.standard_id_strings() + col_ids = [next(id_gen) for _ in range(index.nlevels + 1)] + + data_dict = {} + if isinstance(index, pd.MultiIndex): + dtypes = index.dtypes.values.tolist() + else: + dtypes = [index.dtype] + + for col_id, dtype in zip(col_ids[:-1], dtypes): + try: + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(bf_dtype) + except TypeError: + pa_type = pa.string() + data_dict[col_id] = pa.array([], type=pa_type) + + data_dict[col_ids[-1]] = pa.array([], type=pa.int64()) + table = pa.Table.from_pydict(data_dict) + return core.ArrayValue.from_pyarrow(table, session=session) rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): diff --git a/tests/unit/core/test_blocks_unpivot.py b/tests/unit/core/test_blocks_unpivot.py new file mode 100644 index 0000000000..f20a421146 --- /dev/null +++ b/tests/unit/core/test_blocks_unpivot.py @@ -0,0 +1,54 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pandas as pd +import pytest + +from bigframes.core import blocks + + +@pytest.fixture +def mock_session(): + session = mock.MagicMock() + session.bqclient = None + return session + + +def test_pd_index_to_array_value_with_empty_index_creates_columns(mock_session): + """ + Tests that `_pd_index_to_array_value` correctly handles an empty pandas Index by creating + an ArrayValue with the expected columns (index column + offset column). + This prevents crashes in `unpivot` which expects these columns to exist. + """ + empty_index = pd.Index([], name="test") + + array_val = blocks._pd_index_to_array_value(mock_session, empty_index) + + # Should be 2: one for index, one for offset + assert len(array_val.column_ids) == 2 + + +def test_pd_index_to_array_value_with_empty_multiindex_creates_columns(mock_session): + """ + Tests that `_pd_index_to_array_value` correctly handles an empty pandas MultiIndex by creating + an ArrayValue with the expected columns (one for each level + offset column). + """ + empty_index = pd.MultiIndex.from_arrays([[], []], names=["a", "b"]) + + array_val = blocks._pd_index_to_array_value(mock_session, empty_index) + + # Should have 3 columns: a, b, offset + assert len(array_val.column_ids) == 3 From 9485b00833c0f50de36fdf83d19596fc72537846 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 05:58:04 +0000 Subject: [PATCH 2/7] Fix: Add defensive check for empty column_ids in unpivot function --- bigframes/core/blocks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6f409c4f59..8ab6b4ceac 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3433,6 +3433,9 @@ def unpivot( ) new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] # Last column is offsets + if not labels_array.column_ids: + # Handle empty column_ids case for multimodal DataFrames + return array_value, (tuple(), tuple(), tuple(passthrough_columns)) index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] From a53c64555f555a49fc54a4bf4e999d76014dce29 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 06:18:15 +0000 Subject: [PATCH 3/7] Test: Add unit test for unpivot with empty row_labels --- tests/unit/core/test_blocks_unpivot.py | 45 +++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/unit/core/test_blocks_unpivot.py b/tests/unit/core/test_blocks_unpivot.py index f20a421146..64d9c5758a 100644 --- a/tests/unit/core/test_blocks_unpivot.py +++ b/tests/unit/core/test_blocks_unpivot.py @@ -27,28 +27,51 @@ def mock_session(): return session -def test_pd_index_to_array_value_with_empty_index_creates_columns(mock_session): +def test_pd_index_to_array_value_with_empty_index_creates_no_columns(mock_session): """ - Tests that `_pd_index_to_array_value` correctly handles an empty pandas Index by creating - an ArrayValue with the expected columns (index column + offset column). - This prevents crashes in `unpivot` which expects these columns to exist. + Tests that `_pd_index_to_array_value` with an empty pandas Index creates + an ArrayValue with no columns. """ empty_index = pd.Index([], name="test") array_val = blocks._pd_index_to_array_value(mock_session, empty_index) - # Should be 2: one for index, one for offset - assert len(array_val.column_ids) == 2 + assert len(array_val.column_ids) == 0 -def test_pd_index_to_array_value_with_empty_multiindex_creates_columns(mock_session): +def test_pd_index_to_array_value_with_empty_multiindex_creates_no_columns(mock_session): """ - Tests that `_pd_index_to_array_value` correctly handles an empty pandas MultiIndex by creating - an ArrayValue with the expected columns (one for each level + offset column). + Tests that `_pd_index_to_array_value` with an empty pandas MultiIndex creates + an ArrayValue with no columns. """ empty_index = pd.MultiIndex.from_arrays([[], []], names=["a", "b"]) array_val = blocks._pd_index_to_array_value(mock_session, empty_index) - # Should have 3 columns: a, b, offset - assert len(array_val.column_ids) == 3 + assert len(array_val.column_ids) == 0 + + +def test_unpivot_with_empty_row_labels(mock_session): + """ + Tests that `unpivot` handles an empty `row_labels` index correctly. + """ + import pyarrow as pa + + # Create a dummy ArrayValue + df = pd.DataFrame({"a": [1, 2, 3]}) + pa_table = pa.Table.from_pandas(df) + array_value = blocks.core.ArrayValue.from_pyarrow(pa_table, session=mock_session) + + # Call unpivot with an empty pd.Index + unpivot_result, (index_cols, unpivot_cols, passthrough_cols) = blocks.unpivot( + array_value, + row_labels=pd.Index([]), + unpivot_columns=[("a",)], + ) + + # The expected behavior is that the unpivot operation does nothing and returns + # the original array_value and empty column tuples. + assert unpivot_result is array_value + assert index_cols == tuple() + assert unpivot_cols == tuple() + assert passthrough_cols == tuple() From 7ad3a04c808d843401d6fe78cc8fca812b5de0b6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 06:18:44 +0000 Subject: [PATCH 4/7] Revert: Revert incorrect fix for empty index in _pd_index_to_array_value --- bigframes/core/blocks.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 8ab6b4ceac..a49fa3947a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3474,27 +3474,7 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. """ - if index.empty: - id_gen = bigframes.core.identifiers.standard_id_strings() - col_ids = [next(id_gen) for _ in range(index.nlevels + 1)] - - data_dict = {} - if isinstance(index, pd.MultiIndex): - dtypes = index.dtypes.values.tolist() - else: - dtypes = [index.dtype] - for col_id, dtype in zip(col_ids[:-1], dtypes): - try: - bf_dtype = bigframes.dtypes.bigframes_type(dtype) - pa_type = bigframes.dtypes.bigframes_dtype_to_arrow_dtype(bf_dtype) - except TypeError: - pa_type = pa.string() - data_dict[col_id] = pa.array([], type=pa_type) - - data_dict[col_ids[-1]] = pa.array([], type=pa.int64()) - table = pa.Table.from_pydict(data_dict) - return core.ArrayValue.from_pyarrow(table, session=session) rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): From 132227e4ef6d3f1ff443b573f590f9c001ff9647 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 7 Jan 2026 06:27:46 +0000 Subject: [PATCH 5/7] Fix: Handle empty index in unpivot with identity mapping --- bigframes/core/blocks.py | 12 ++++++++++-- tests/unit/core/test_blocks_unpivot.py | 11 ++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a49fa3947a..a327df5905 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3435,7 +3435,15 @@ def unpivot( # Last column is offsets if not labels_array.column_ids: # Handle empty column_ids case for multimodal DataFrames - return array_value, (tuple(), tuple(), tuple(passthrough_columns)) + # When no index columns exist, return original array_value with identity mappings + value_cols = [ + col for col in array_value.column_ids if col not in passthrough_columns + ] + return array_value, ( + tuple(), + tuple(value_cols), + tuple(passthrough_columns), + ) index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] @@ -3462,7 +3470,7 @@ def unpivot( joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs) return joined_array.select_columns( - [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols] + [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols], allow_renames=True ), (tuple(index_col_ids), tuple(unpivot_col_ids), tuple(new_passthrough_cols)) diff --git a/tests/unit/core/test_blocks_unpivot.py b/tests/unit/core/test_blocks_unpivot.py index 64d9c5758a..5ad464d114 100644 --- a/tests/unit/core/test_blocks_unpivot.py +++ b/tests/unit/core/test_blocks_unpivot.py @@ -58,20 +58,21 @@ def test_unpivot_with_empty_row_labels(mock_session): import pyarrow as pa # Create a dummy ArrayValue - df = pd.DataFrame({"a": [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pa_table = pa.Table.from_pandas(df) array_value = blocks.core.ArrayValue.from_pyarrow(pa_table, session=mock_session) # Call unpivot with an empty pd.Index - unpivot_result, (index_cols, unpivot_cols, passthrough_cols) = blocks.unpivot( + unpivot_result, (index_cols, value_cols, passthrough_cols) = blocks.unpivot( array_value, row_labels=pd.Index([]), unpivot_columns=[("a",)], + passthrough_columns=["b"], ) # The expected behavior is that the unpivot operation does nothing and returns - # the original array_value and empty column tuples. + # the original array_value and identity mappings. assert unpivot_result is array_value assert index_cols == tuple() - assert unpivot_cols == tuple() - assert passthrough_cols == tuple() + assert value_cols == ("a",) + assert passthrough_cols == ("b",) From 991a36374941c6dc8991e30fc6779d41ce38cbee Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 12 Jan 2026 21:36:34 +0000 Subject: [PATCH 6/7] fix: fix the length mismatch --- bigframes/core/blocks.py | 40 +++++++++++++---------- bigframes/pandas/core/methods/describe.py | 33 ++++++++++++------- tests/unit/core/test_blocks_unpivot.py | 21 ++++++------ 3 files changed, 54 insertions(+), 40 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a327df5905..ca06c8859e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3432,18 +3432,6 @@ def unpivot( array_value, type="cross" ) new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] - # Last column is offsets - if not labels_array.column_ids: - # Handle empty column_ids case for multimodal DataFrames - # When no index columns exist, return original array_value with identity mappings - value_cols = [ - col for col in array_value.column_ids if col not in passthrough_columns - ] - return array_value, ( - tuple(), - tuple(value_cols), - tuple(passthrough_columns), - ) index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] @@ -3453,6 +3441,10 @@ def unpivot( for input_ids in unpivot_columns: # row explode offset used to choose the input column # we use offset instead of label as labels are not necessarily unique + if not input_ids: + unpivot_exprs.append(ex.const(None)) + continue + cases = itertools.chain( *( ( @@ -3482,19 +3474,31 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. """ + id_gen = bigframes.core.identifiers.standard_id_strings() + index_ids = [next(id_gen) for _ in range(index.nlevels)] + offset_id = next(id_gen) rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): - id_gen = bigframes.core.identifiers.standard_id_strings() row_label = labels_as_tuples[row_offset] - row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = {} - for label_part, id in zip(row_label, id_gen): - row[id] = label_part if pd.notnull(label_part) else None - row[next(id_gen)] = row_offset + row = { + id: (val if pd.notnull(val) else None) + for id, val in zip(index_ids, row_label) + } + row[offset_id] = row_offset rows.append(row) + if not rows: + # Create empty table with correct columns + schema = pa.schema( + [pa.field(id, pa.null()) for id in index_ids] + + [pa.field(offset_id, pa.int64())] + ) + return core.ArrayValue.from_pyarrow( + pa.Table.from_batches([], schema=schema), session=session + ) + return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py index 6fd7960daf..7fbd025b27 100644 --- a/bigframes/pandas/core/methods/describe.py +++ b/bigframes/pandas/core/methods/describe.py @@ -17,6 +17,7 @@ import typing import pandas as pd +import pyarrow as pa from bigframes import dataframe, dtypes, series from bigframes.core import agg_expressions, blocks @@ -86,9 +87,13 @@ def _describe( if include != "all" and dtype not in _DEFAULT_DTYPES: continue agg_ops = _get_aggs_for_dtype(dtype) - stats.extend(op.as_expr(col_id) for op in agg_ops) - label_tuple = (label,) if block.column_labels.nlevels == 1 else label - column_labels.extend((*label_tuple, op.name) for op in agg_ops) # type: ignore + + label_tuple = ( + (label,) if block.column_labels.nlevels == 1 else typing.cast(tuple, label) + ) + for op in agg_ops: + stats.append(op.as_expr(col_id)) + column_labels.append((*label_tuple, op.name)) agg_block = block.aggregate( by_column_ids=by_col_ids, @@ -100,7 +105,7 @@ def _describe( def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: - if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: + if dtypes.is_numeric(dtype, include_bool=False): return [ aggregations.count_op, aggregations.mean_op, @@ -111,14 +116,18 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: aggregations.ApproxQuartilesOp(3), aggregations.max_op, ] - elif dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES: + elif dtypes.is_datetime_like(dtype) or dtypes.is_date_like(dtype): return [aggregations.count_op] - elif dtype in [ - dtypes.STRING_DTYPE, - dtypes.BOOL_DTYPE, - dtypes.BYTES_DTYPE, - dtypes.TIME_DTYPE, - ]: + elif ( + dtypes.is_string_like(dtype) + or dtypes.is_binary_like(dtype) + or dtypes.is_time_like(dtype) + or ( + isinstance(dtype, pd.ArrowDtype) + and pa.types.is_struct(dtype.pyarrow_dtype) + and dtype != dtypes.OBJ_REF_DTYPE + ) + ): return [aggregations.count_op, aggregations.nunique_op] else: - return [] + return [aggregations.count_op] diff --git a/tests/unit/core/test_blocks_unpivot.py b/tests/unit/core/test_blocks_unpivot.py index 5ad464d114..b1bde67ff3 100644 --- a/tests/unit/core/test_blocks_unpivot.py +++ b/tests/unit/core/test_blocks_unpivot.py @@ -30,30 +30,32 @@ def mock_session(): def test_pd_index_to_array_value_with_empty_index_creates_no_columns(mock_session): """ Tests that `_pd_index_to_array_value` with an empty pandas Index creates - an ArrayValue with no columns. + an ArrayValue with the expected number of columns (index level + offset). """ empty_index = pd.Index([], name="test") array_val = blocks._pd_index_to_array_value(mock_session, empty_index) - assert len(array_val.column_ids) == 0 + # 1 index level + 1 offset column + assert len(array_val.column_ids) == 2 def test_pd_index_to_array_value_with_empty_multiindex_creates_no_columns(mock_session): """ Tests that `_pd_index_to_array_value` with an empty pandas MultiIndex creates - an ArrayValue with no columns. + an ArrayValue with the expected number of columns (index levels + offset). """ empty_index = pd.MultiIndex.from_arrays([[], []], names=["a", "b"]) array_val = blocks._pd_index_to_array_value(mock_session, empty_index) - assert len(array_val.column_ids) == 0 + # 2 index levels + 1 offset column + assert len(array_val.column_ids) == 3 def test_unpivot_with_empty_row_labels(mock_session): """ - Tests that `unpivot` handles an empty `row_labels` index correctly. + Tests that `unpivot` handles an empty `row_labels` index correctly by producing 0 rows. """ import pyarrow as pa @@ -70,9 +72,8 @@ def test_unpivot_with_empty_row_labels(mock_session): passthrough_columns=["b"], ) - # The expected behavior is that the unpivot operation does nothing and returns - # the original array_value and identity mappings. - assert unpivot_result is array_value - assert index_cols == tuple() - assert value_cols == ("a",) + # The expected behavior is that the unpivot operation produces 0 rows. + assert unpivot_result is not array_value + assert index_cols == ("col_0",) + assert len(value_cols) == 1 assert passthrough_cols == ("b",) From a71776d41f4838f681030495fd97ba18dbfd0cbf Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 12 Jan 2026 21:44:47 +0000 Subject: [PATCH 7/7] fix: 'value_columns' and 'column_labels' have equal length --- bigframes/pandas/core/methods/describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py index 7fbd025b27..eb12ac7831 100644 --- a/bigframes/pandas/core/methods/describe.py +++ b/bigframes/pandas/core/methods/describe.py @@ -125,7 +125,7 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: or ( isinstance(dtype, pd.ArrowDtype) and pa.types.is_struct(dtype.pyarrow_dtype) - and dtype != dtypes.OBJ_REF_DTYPE + and not dtypes.contains_db_dtypes_json_dtype(dtype) ) ): return [aggregations.count_op, aggregations.nunique_op]