diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0f98f582c2..ca06c8859e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -3432,7 +3432,6 @@ def unpivot( array_value, type="cross" ) new_passthrough_cols = [column_mapping[col] for col in passthrough_columns] - # Last column is offsets index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]] explode_offsets_id = labels_mapping[labels_array.column_ids[-1]] @@ -3442,6 +3441,10 @@ def unpivot( for input_ids in unpivot_columns: # row explode offset used to choose the input column # we use offset instead of label as labels are not necessarily unique + if not input_ids: + unpivot_exprs.append(ex.const(None)) + continue + cases = itertools.chain( *( ( @@ -3459,7 +3462,7 @@ def unpivot( joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs) return joined_array.select_columns( - [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols] + [*index_col_ids, *unpivot_col_ids, *new_passthrough_cols], allow_renames=True ), (tuple(index_col_ids), tuple(unpivot_col_ids), tuple(new_passthrough_cols)) @@ -3471,18 +3474,31 @@ def _pd_index_to_array_value( Create an ArrayValue from a list of label tuples. The last column will be row offsets. 
""" + id_gen = bigframes.core.identifiers.standard_id_strings() + index_ids = [next(id_gen) for _ in range(index.nlevels)] + offset_id = next(id_gen) + rows = [] labels_as_tuples = utils.index_as_tuples(index) for row_offset in range(len(index)): - id_gen = bigframes.core.identifiers.standard_id_strings() row_label = labels_as_tuples[row_offset] - row_label = (row_label,) if not isinstance(row_label, tuple) else row_label - row = {} - for label_part, id in zip(row_label, id_gen): - row[id] = label_part if pd.notnull(label_part) else None - row[next(id_gen)] = row_offset + row = { + id: (val if pd.notnull(val) else None) + for id, val in zip(index_ids, row_label) + } + row[offset_id] = row_offset rows.append(row) + if not rows: + # Create empty table with correct columns + schema = pa.schema( + [pa.field(id, pa.null()) for id in index_ids] + + [pa.field(offset_id, pa.int64())] + ) + return core.ArrayValue.from_pyarrow( + pa.Table.from_batches([], schema=schema), session=session + ) + return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session) diff --git a/bigframes/pandas/core/methods/describe.py b/bigframes/pandas/core/methods/describe.py index 6fd7960daf..eb12ac7831 100644 --- a/bigframes/pandas/core/methods/describe.py +++ b/bigframes/pandas/core/methods/describe.py @@ -17,6 +17,7 @@ import typing import pandas as pd +import pyarrow as pa from bigframes import dataframe, dtypes, series from bigframes.core import agg_expressions, blocks @@ -86,9 +87,13 @@ def _describe( if include != "all" and dtype not in _DEFAULT_DTYPES: continue agg_ops = _get_aggs_for_dtype(dtype) - stats.extend(op.as_expr(col_id) for op in agg_ops) - label_tuple = (label,) if block.column_labels.nlevels == 1 else label - column_labels.extend((*label_tuple, op.name) for op in agg_ops) # type: ignore + + label_tuple = ( + (label,) if block.column_labels.nlevels == 1 else typing.cast(tuple, label) + ) + for op in agg_ops: + stats.append(op.as_expr(col_id)) + 
column_labels.append((*label_tuple, op.name)) agg_block = block.aggregate( by_column_ids=by_col_ids, @@ -100,7 +105,7 @@ def _describe( def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: - if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: + if dtypes.is_numeric(dtype, include_bool=False): return [ aggregations.count_op, aggregations.mean_op, @@ -111,14 +116,18 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]: aggregations.ApproxQuartilesOp(3), aggregations.max_op, ] - elif dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES: + elif dtypes.is_datetime_like(dtype) or dtypes.is_date_like(dtype): return [aggregations.count_op] - elif dtype in [ - dtypes.STRING_DTYPE, - dtypes.BOOL_DTYPE, - dtypes.BYTES_DTYPE, - dtypes.TIME_DTYPE, - ]: + elif ( + dtypes.is_string_like(dtype) + or dtypes.is_binary_like(dtype) + or dtypes.is_time_like(dtype) + or ( + isinstance(dtype, pd.ArrowDtype) + and pa.types.is_struct(dtype.pyarrow_dtype) + and not dtypes.contains_db_dtypes_json_dtype(dtype) + ) + ): return [aggregations.count_op, aggregations.nunique_op] else: - return [] + return [aggregations.count_op] diff --git a/tests/unit/core/test_blocks_unpivot.py b/tests/unit/core/test_blocks_unpivot.py new file mode 100644 index 0000000000..b1bde67ff3 --- /dev/null +++ b/tests/unit/core/test_blocks_unpivot.py @@ -0,0 +1,79 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from unittest import mock

import pandas as pd
import pyarrow as pa
import pytest

from bigframes.core import blocks


@pytest.fixture
def mock_session():
    """Return a ``MagicMock`` standing in for a session.

    ``bqclient`` is explicitly set to ``None`` so the attribute resolves to
    ``None`` instead of an auto-created child mock.
    """
    session = mock.MagicMock()
    session.bqclient = None
    return session


def test_pd_index_to_array_value_with_empty_index_creates_no_columns(mock_session):
    """
    Tests that `_pd_index_to_array_value` with an empty pandas Index creates
    an ArrayValue with the expected number of columns (index level + offset).

    Note: despite the `creates_no_columns` suffix in the test name, the
    assertion below expects two columns, not zero.
    """
    empty_index = pd.Index([], name="test")

    array_val = blocks._pd_index_to_array_value(mock_session, empty_index)

    # 1 index level + 1 offset column
    assert len(array_val.column_ids) == 2


def test_pd_index_to_array_value_with_empty_multiindex_creates_no_columns(mock_session):
    """
    Tests that `_pd_index_to_array_value` with an empty pandas MultiIndex creates
    an ArrayValue with the expected number of columns (index levels + offset).

    Note: despite the `creates_no_columns` suffix in the test name, the
    assertion below expects three columns, not zero.
    """
    empty_index = pd.MultiIndex.from_arrays([[], []], names=["a", "b"])

    array_val = blocks._pd_index_to_array_value(mock_session, empty_index)

    # 2 index levels + 1 offset column
    assert len(array_val.column_ids) == 3


def test_unpivot_with_empty_row_labels(mock_session):
    """
    Tests that `unpivot` accepts an empty `row_labels` index and returns the
    expected index, value and passthrough column ids.

    Only the returned column ids are checked; the row count of the result is
    not inspected by this test.
    """
    # Create a dummy ArrayValue
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    pa_table = pa.Table.from_pandas(df)
    array_value = blocks.core.ArrayValue.from_pyarrow(pa_table, session=mock_session)

    # Call unpivot with an empty pd.Index
    unpivot_result, (index_cols, value_cols, passthrough_cols) = blocks.unpivot(
        array_value,
        row_labels=pd.Index([]),
        unpivot_columns=[("a",)],
        passthrough_columns=["b"],
    )

    # A new ArrayValue must be returned rather than the input object itself.
    assert unpivot_result is not array_value
    # One index column (for the single-level empty Index), one unpivoted value
    # column, and the passthrough column keeping its original id.
    assert index_cols == ("col_0",)
    assert len(value_cols) == 1
    assert passthrough_cols == ("b",)