Skip to content

Commit 24d315e

Browse files
authored
Merge pull request #78 from awslabs/decimal
add Decimal type support
2 parents 26bcddf + 2b4fa3f commit 24d315e

File tree

4 files changed

+134
-2
lines changed

4 files changed

+134
-2
lines changed

awswrangler/data_types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ def athena2pandas(dtype: str) -> str:
2626
return "date"
2727
elif dtype == "array":
2828
return "list"
29+
elif dtype == "decimal":
30+
return "decimal"
2931
else:
3032
raise UnsupportedType(f"Unsupported Athena type: {dtype}")
3133

@@ -162,6 +164,8 @@ def pyarrow2athena(dtype: pa.types) -> str:
162164
return "timestamp"
163165
elif dtype_str.startswith("date"):
164166
return "date"
167+
elif dtype_str.startswith("decimal"):
168+
return dtype_str.replace(" ", "")
165169
elif dtype_str.startswith("list"):
166170
return f"array<{pyarrow2athena(dtype.value_type)}>"
167171
elif dtype_str == "null":
@@ -190,6 +194,8 @@ def pyarrow2redshift(dtype: pa.types) -> str:
190194
return "TIMESTAMP"
191195
elif dtype_str.startswith("date"):
192196
return "DATE"
197+
elif dtype_str.startswith("decimal"):
198+
return dtype_str.replace(" ", "").upper()
193199
else:
194200
raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
195201

@@ -280,6 +286,8 @@ def spark2redshift(dtype: str) -> str:
280286
return "DATE"
281287
elif dtype == "string":
282288
return "VARCHAR(256)"
289+
elif dtype.startswith("decimal"):
290+
return dtype.replace(" ", "").upper()
283291
else:
284292
raise UnsupportedType("Unsupported Spark type: " + dtype)
285293

awswrangler/pandas.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import copy
77
import csv
88
from datetime import datetime
9-
import ast
9+
from decimal import Decimal
1010

1111
from botocore.exceptions import ClientError, HTTPClientError # type: ignore
1212
import pandas as pd # type: ignore
@@ -471,6 +471,8 @@ def _get_query_dtype(self, query_execution_id: str) -> Tuple[Dict[str, str], Lis
471471
converters[col_name] = Pandas._list_parser
472472
elif pandas_type == "bool":
473473
logger.debug(f"Ignoring bool column: {col_name}")
474+
elif pandas_type == "decimal":
475+
converters[col_name] = lambda x: Decimal(str(x)) if str(x) != "" else None
474476
else:
475477
dtype[col_name] = pandas_type
476478
logger.debug(f"dtype: {dtype}")

testing/test_awswrangler/test_pandas.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import csv
44
from datetime import datetime, date
5+
from decimal import Decimal
56

67
import pytest
78
import boto3
@@ -1303,7 +1304,6 @@ def test_to_parquet_array(session, bucket, database):
13031304
df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
13041305
if len(df.index) == len(df2.index):
13051306
break
1306-
print(df2)
13071307
session.s3.delete_objects(path=path)
13081308

13091309
assert len(list(df.columns)) == len(list(df2.columns))
@@ -1312,3 +1312,37 @@ def test_to_parquet_array(session, bucket, database):
13121312
assert df2[df2.a == 2].iloc[0].b[0] == 4.0
13131313
assert df2[df2.a == 2].iloc[0].c[0] == 7
13141314
assert df2[df2.a == 2].iloc[0].d[0] == "foo"
1315+
1316+
1317+
def test_to_parquet_decimal(session, bucket, database):
    """Round-trip decimal columns: write a DataFrame to Parquet on S3, then read it back via Athena."""
    dataframe = pd.DataFrame({
        "id": [1, 2, 3],
        # Decimal((sign, digits, exponent)): 1.99 / None / 1.90 at 2 fractional digits
        "decimal_2": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
        # 1.99999 / None / 1.90000 at 5 fractional digits
        "decimal_5": [Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None, Decimal((0, (1, 9, 0, 0, 0, 0), -5))],
    })
    print(dataframe)
    print(dataframe.dtypes)
    s3_path = f"s3://{bucket}/test/"
    session.pandas.to_parquet(dataframe=dataframe,
                              database=database,
                              path=s3_path,
                              mode="overwrite",
                              preserve_index=False,
                              procs_cpu_bound=1)
    result = None
    for _ in range(10):  # Retrying to workaround s3 eventual consistency
        sleep(1)
        result = session.pandas.read_sql_athena(sql="select * from test", database=database)
        if len(dataframe.index) == len(result.index):
            break
    session.s3.delete_objects(path=s3_path)

    assert len(list(dataframe.columns)) == len(list(result.columns))
    assert len(dataframe.index) == len(result.index)

    first = result[result.id == 1].iloc[0]
    assert first.decimal_2 == Decimal((0, (1, 9, 9), -2))
    assert first.decimal_5 == Decimal((0, (1, 9, 9, 9, 9, 9), -5))
    second = result[result.id == 2].iloc[0]
    assert second.decimal_2 is None
    assert second.decimal_5 is None
    third = result[result.id == 3].iloc[0]
    assert third.decimal_2 == Decimal((0, (1, 9, 0), -2))
    assert third.decimal_5 == Decimal((0, (1, 9, 0, 0, 0, 0), -5))

testing/test_awswrangler/test_redshift.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import logging
33
from datetime import date, datetime
4+
from decimal import Decimal
45

56
import pytest
67
import boto3
@@ -421,3 +422,90 @@ def test_connection_with_different_port_types(redshift_parameters):
421422
password=redshift_parameters.get("RedshiftPassword"),
422423
)
423424
conn.close()
425+
426+
427+
def test_to_redshift_pandas_decimal(session, bucket, redshift_parameters):
    """Load a pandas DataFrame with decimal columns into Redshift and verify every row.

    Fix: the elif branches previously compared the wrong columns
    (``row[1] == 2`` and ``row[2] == 3`` instead of ``row[0]``), so the
    assertions for rows 2 and 3 never ran and the test silently passed
    without checking them.
    """
    df = pd.DataFrame({
        "id": [1, 2, 3],
        # Decimal((sign, digits, exponent)): 1.99 / None / 1.90
        "decimal_2": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
        # 1.99999 / None / 1.90000
        "decimal_5": [Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None, Decimal((0, (1, 9, 0, 0, 0, 0), -5))],
    })
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=False,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert len(df.index) == len(rows)
    assert len(list(df.columns)) == len(list(rows[0]))
    print(rows)
    # row[0] is the id column; dispatch on it (was wrongly row[1]/row[2]).
    for row in rows:
        if row[0] == 1:
            assert row[1] == Decimal((0, (1, 9, 9), -2))
            assert row[2] == Decimal((0, (1, 9, 9, 9, 9, 9), -5))
        elif row[0] == 2:
            assert row[1] is None
            assert row[2] is None
        elif row[0] == 3:
            assert row[1] == Decimal((0, (1, 9, 0), -2))
            assert row[2] == Decimal((0, (1, 9, 0, 0, 0, 0), -5))
469+
470+
471+
def test_to_redshift_spark_decimal(session, bucket, redshift_parameters):
    """Load a Spark DataFrame with DECIMAL columns into Redshift and verify every row.

    Fix: the elif branches previously compared the wrong columns
    (``row[1] == 2`` and ``row[2] == 3`` instead of ``row[0]``), so the
    assertions for rows 2 and 3 never ran and the test silently passed
    without checking them.
    """
    df = session.spark_session.createDataFrame(pd.DataFrame({
        "id": [1, 2, 3],
        # Decimal((sign, digits, exponent)): 1.99 / None / 1.90 and 1.99999 / None / 1.90000
        "decimal_2": [Decimal((0, (1, 9, 9), -2)), None, Decimal((0, (1, 9, 0), -2))],
        "decimal_5": [Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None, Decimal((0, (1, 9, 0, 0, 0, 0), -5))]}),
        schema="id INTEGER, decimal_2 DECIMAL(3,2), decimal_5 DECIMAL(6,5)")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="test",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load2/"
    session.spark.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test2",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test2")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert df.count() == len(rows)
    assert len(list(df.columns)) == len(list(rows[0]))
    print(rows)
    # row[0] is the id column; dispatch on it (was wrongly row[1]/row[2]).
    for row in rows:
        if row[0] == 1:
            assert row[1] == Decimal((0, (1, 9, 9), -2))
            assert row[2] == Decimal((0, (1, 9, 9, 9, 9, 9), -5))
        elif row[0] == 2:
            assert row[1] is None
            assert row[2] is None
        elif row[0] == 3:
            assert row[1] == Decimal((0, (1, 9, 0), -2))
            assert row[2] == Decimal((0, (1, 9, 0, 0, 0, 0), -5))

0 commit comments

Comments
 (0)