Skip to content

Commit 16829eb

Browse files
committed
Fix bug for read_parquet with offset timezones. #385
1 parent 1bd34c4 commit 16829eb

File tree

5 files changed

+25
-8
lines changed

5 files changed

+25
-8
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
 
 
 > An [AWS Professional Service](https://aws.amazon.com/professional-services/) open source initiative | aws-proserve-opensource@amazon.com
 
-[![Release](https://img.shields.io/badge/release-1.9.2-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-1.9.3-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-brightgreen.svg)](https://anaconda.org/conda-forge/awswrangler)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)

awswrangler/__metadata__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@
 
 __title__: str = "awswrangler"
 __description__: str = "Pandas on AWS."
-__version__: str = "1.9.2"
+__version__: str = "1.9.3"
 __license__: str = "Apache License 2.0"

awswrangler/s3/_read_parquet.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,11 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
 def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
     for c in metadata["columns"]:
         if c["field_name"] in df.columns and c["pandas_type"] == "datetimetz":
-            _logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"])
-            if isinstance(df[c["field_name"]].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype) is False:
+            timezone: datetime.tzinfo = pa.lib.string_to_tzinfo(c["metadata"]["timezone"])
+            _logger.debug("applying timezone (%s) on column %s", timezone, c["field_name"])
+            if hasattr(df[c["field_name"]].dtype, "tz") is False:
                 df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
-            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"])
+            df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=timezone)
     return df
 
 
tests/test_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
 
 
 def test_metadata():
-    assert wr.__version__ == "1.9.2"
+    assert wr.__version__ == "1.9.3"
     assert wr.__title__ == "awswrangler"
     assert wr.__description__ == "Pandas on AWS."
     assert wr.__license__ == "Apache License 2.0"

tests/test_s3_parquet.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
 import itertools
 import logging
 import math
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 
 import boto3
 import numpy as np
@@ -362,7 +362,7 @@ def test_timezone_file(path, use_threads):
     assert df.equals(df2)
 
 
-@pytest.mark.parametrize("use_threads", [False])
+@pytest.mark.parametrize("use_threads", [True, False])
 def test_timezone_file_columns(path, use_threads):
     file_path = f"{path}0.parquet"
     df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
@@ -371,3 +371,19 @@ def test_timezone_file_columns(path, use_threads):
     wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
     df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
     assert df[["c1"]].equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+def test_timezone_raw_values(path, use_threads):
+    df = pd.DataFrame({"c0": [1.1, 2.2], "par": ["a", "b"]})
+    df["c1"] = pd.to_datetime(datetime.now(timezone.utc))
+    df["c2"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(seconds=14400))))
+    df["c3"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(-timedelta(seconds=14400))))
+    df["c4"] = pd.to_datetime(datetime(2011, 11, 4, 0, 5, 23, tzinfo=timezone(timedelta(hours=-8))))
+    paths = wr.s3.to_parquet(partition_cols=["par"], df=df, path=path, dataset=True, sanitize_columns=False)["paths"]
+    wr.s3.wait_objects_exist(paths, use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
+    df3 = pd.concat([pd.read_parquet(p) for p in paths], ignore_index=True)
+    df2["par"] = df2["par"].astype("string")
+    df3["par"] = df3["par"].astype("string")
+    assert df2.equals(df3)

0 commit comments

Comments (0)