Skip to content

Commit a51f2d3

Browse files
authored
fix checking physical type for Decimal type in StatsAggregator (#2515)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Fix for #2057. It looks like the initial fix #1839 might have missed updating here to handle. I could use feedback on if this is the best fix, it is at least simple. ## Are these changes tested? - [x] added unit tests ## Are there any user-facing changes? No <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent a793fe5 commit a51f2d3

File tree

2 files changed

+52
-6
lines changed

2 files changed

+52
-6
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,13 +2082,18 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc
20822082
self.trunc_length = trunc_length
20832083

20842084
expected_physical_type = _primitive_to_physical(iceberg_type)
2085+
2086+
# TODO: Refactor to use promotion logic
20852087
if expected_physical_type != physical_type_string:
20862088
# Allow promotable physical types
20872089
# INT32 -> INT64 and FLOAT -> DOUBLE are safe type casts
20882090
if (physical_type_string == "INT32" and expected_physical_type == "INT64") or (
20892091
physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE"
20902092
):
20912093
pass
2094+
# Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY, INT32 or INT64
2095+
elif physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64"):
2096+
pass
20922097
else:
20932098
raise ValueError(
20942099
f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}"
@@ -2506,12 +2511,16 @@ def data_file_statistics_from_parquet_metadata(
25062511

25072512
if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
25082513
scale = stats_col.iceberg_type.scale
2509-
col_aggs[field_id].update_min(
2510-
unscaled_to_decimal(statistics.min_raw, scale)
2511-
) if statistics.min_raw is not None else None
2512-
col_aggs[field_id].update_max(
2513-
unscaled_to_decimal(statistics.max_raw, scale)
2514-
) if statistics.max_raw is not None else None
2514+
(
2515+
col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
2516+
if statistics.min_raw is not None
2517+
else None
2518+
)
2519+
(
2520+
col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
2521+
if statistics.max_raw is not None
2522+
else None
2523+
)
25152524
else:
25162525
col_aggs[field_id].update_min(statistics.min)
25172526
col_aggs[field_id].update_max(statistics.max)

tests/io/test_pyarrow.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2188,6 +2188,43 @@ def test_stats_aggregator_update_max(vals: List[Any], primitive_type: PrimitiveT
21882188
assert stats.current_max == expected_result
21892189

21902190

2191+
@pytest.mark.parametrize(
2192+
"iceberg_type, physical_type_string",
2193+
[
2194+
# Exact match
2195+
(IntegerType(), "INT32"),
2196+
# Allowed INT32 -> INT64 promotion
2197+
(LongType(), "INT32"),
2198+
# Allowed FLOAT -> DOUBLE promotion
2199+
(DoubleType(), "FLOAT"),
2200+
# Allowed FIXED_LEN_BYTE_ARRAY -> INT32
2201+
(DecimalType(precision=2, scale=2), "FIXED_LEN_BYTE_ARRAY"),
2202+
# Allowed FIXED_LEN_BYTE_ARRAY -> INT64
2203+
(DecimalType(precision=12, scale=2), "FIXED_LEN_BYTE_ARRAY"),
2204+
],
2205+
)
2206+
def test_stats_aggregator_conditionally_allowed_types_pass(iceberg_type: PrimitiveType, physical_type_string: str) -> None:
2207+
stats = StatsAggregator(iceberg_type, physical_type_string)
2208+
2209+
assert stats.primitive_type == iceberg_type
2210+
assert stats.current_min is None
2211+
assert stats.current_max is None
2212+
2213+
2214+
@pytest.mark.parametrize(
2215+
"iceberg_type, physical_type_string",
2216+
[
2217+
# Fail case: INT64 cannot be cast to INT32
2218+
(IntegerType(), "INT64"),
2219+
],
2220+
)
2221+
def test_stats_aggregator_physical_type_does_not_match_expected_raise_error(
2222+
iceberg_type: PrimitiveType, physical_type_string: str
2223+
) -> None:
2224+
with pytest.raises(ValueError, match="Unexpected physical type"):
2225+
StatsAggregator(iceberg_type, physical_type_string)
2226+
2227+
21912228
def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
21922229
# default packs to 1 bin since the table is small
21932230
bin_packed = bin_pack_arrow_table(

0 commit comments

Comments
 (0)