Skip to content

Commit 16829eb

Browse files
committed
Fix bug for read_parquet with offset timezones. #385
1 parent 1bd34c4 commit 16829eb

File tree

5 files changed

+25
-8
lines changed

5 files changed

+25
-8
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
 
 
 > An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | aws-proserve-opensource@amazon.com
 
-[![Release](https://img.shields.io/badge/release-1.9.2-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-1.9.3-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

awswrangler/__metadata__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@
 
 __title__: str = "awswrangler"
 __description__: str = "Pandas on AWS."
-__version__: str = "1.9.2"
+__version__: str = "1.9.3"
 __license__: str = "Apache License 2.0"

awswrangler/s3/_read_parquet.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,11 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
 def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
     for c in metadata["columns"]:
         if c["field_name"] in df.columns and c["pandas_type"] == "datetimetz":
-            _logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"])
-            if isinstance(df[c["field_name"]].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype) is False:
+            timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(c["metadata"]["timezone"])
+            _logger.debug("applying timezone (%s) on column %s", timezone, c["field_name"])
+            if hasattr(df[c["field_name"]].dtype, "tz") is False:
                 df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
-            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"])
+            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=timezone)
     return df
 
 
tests/test_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
 
 
 def test_metadata():
-    assert wr.__version__ == "1.9.2"
+    assert wr.__version__ == "1.9.3"
     assert wr.__title__ == "awswrangler"
     assert wr.__description__ == "Pandas on AWS."
     assert wr.__license__ == "Apache License 2.0"

tests/test_s3_parquet.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
 import itertools
 import logging
 import math
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 
 import boto3
 import numpy as np
@@ -362,7 +362,7 @@ def test_timezone_file(path, use_threads):
     assert df.equals(df2)
 
 
-@pytest.mark.parametrize("use_threads", [False])
+@pytest.mark.parametrize("use_threads", [True, False])
 def test_timezone_file_columns(path, use_threads):
     file_path = f"{path}0.parquet"
     df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
@@ -371,3 +371,19 @@ def test_timezone_file_columns(path, use_threads):
     wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
     df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
     assert df[["c1"]].equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_timezone_raw_values(path, use_threads):
+    df = pd.DataFrame({"c0": [1.1, 2.2], "par": ["a", "b"]})
+    df["c1"] = pd.to_datetime(datetime.now(timezone.utc))
+    df["c2"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(seconds=14400))))
+    df["c3"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(-timedelta(seconds=14400))))
+    df["c4"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(hours=-8))))
+    paths = wr.s3.to_parquet(partition_cols=["par"], df=df, path=path, dataset=True, sanitize_columns=False)["paths"]
+    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
+    df3 = pd.concat([pd.read_parquet(p) for p in paths], ignore_index=True)
+    df2["par"] = df2["par"].astype("string")
+    df3["par"] = df3["par"].astype("string")
+    assert df2.equals(df3)

0 commit comments

Comments (0)