Skip to content

Commit 2e52044

Browse files
committed
Bumping version to 0.0.17
1 parent 2c8f4a7 commit 2e52044

File tree

5 files changed

+68
-30
lines changed

5 files changed

+68
-30
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
> Utility belt to handle data on AWS.
44
5-
[![Release](https://img.shields.io/badge/release-0.0.16-brightgreen.svg)](https://pypi.org/project/awswrangler/)
5+
[![Release](https://img.shields.io/badge/release-0.0.17-brightgreen.svg)](https://pypi.org/project/awswrangler/)
66
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
77
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
88
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)

awswrangler/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
__title__ = "awswrangler"
22
__description__ = "Utility belt to handle data on AWS."
3-
__version__ = "0.0.16"
3+
__version__ = "0.0.17"
44
__license__ = "Apache License 2.0"

awswrangler/athena.py

Lines changed: 9 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,10 @@
11
from typing import Dict, List, Tuple, Optional, Any, Iterator
22
from time import sleep
33
import logging
4-
import ast
54
import re
65
import unicodedata
76

8-
from awswrangler.data_types import athena2python, athena2pandas
7+
from awswrangler.data_types import athena2python
98
from awswrangler.exceptions import QueryFailed, QueryCancelled
109

1110
logger = logging.getLogger(__name__)
@@ -18,33 +17,16 @@ def __init__(self, session):
1817
self._session = session
1918
self._client_athena = session.boto3_session.client(service_name="athena", config=session.botocore_config)
2019

21-
def get_query_columns_metadata(self, query_execution_id):
22-
response = self._client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
23-
col_info = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
20+
def get_query_columns_metadata(self, query_execution_id: str) -> Dict[str, str]:
21+
"""
22+
Get the data type of all columns queried
23+
:param query_execution_id: Athena query execution ID
24+
:return: Dictionary with all data types
25+
"""
26+
response: Dict = self._client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
27+
col_info: List[Dict[str, str]] = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
2428
return {x["Name"]: x["Type"] for x in col_info}
2529

26-
def get_query_dtype(self, query_execution_id):
27-
cols_metadata = self.get_query_columns_metadata(query_execution_id=query_execution_id)
28-
logger.debug(f"cols_metadata: {cols_metadata}")
29-
dtype = {}
30-
parse_timestamps = []
31-
parse_dates = []
32-
converters = {}
33-
for col_name, col_type in cols_metadata.items():
34-
pandas_type = athena2pandas(dtype=col_type)
35-
if pandas_type in ["datetime64", "date"]:
36-
parse_timestamps.append(col_name)
37-
if pandas_type == "date":
38-
parse_dates.append(col_name)
39-
elif pandas_type == "literal_eval":
40-
converters[col_name] = ast.literal_eval
41-
else:
42-
dtype[col_name] = pandas_type
43-
logger.debug(f"dtype: {dtype}")
44-
logger.debug(f"parse_timestamps: {parse_timestamps}")
45-
logger.debug(f"parse_dates: {parse_dates}")
46-
return dtype, parse_timestamps, parse_dates, converters
47-
4830
def create_athena_bucket(self):
4931
"""
5032
Creates the default Athena bucket if not exists

awswrangler/pandas.py

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import copy
77
import csv
88
from datetime import datetime
9+
import ast
910

1011
import pandas as pd # type: ignore
1112
import pyarrow as pa # type: ignore
@@ -416,6 +417,33 @@ def _read_csv_once(
416417
buff.close()
417418
return dataframe
418419

420+
def _get_query_dtype(self, query_execution_id: str) -> Tuple[Dict[str, str], List[str], List[str], Dict[str, Any]]:
421+
cols_metadata: Dict[str, str] = self._session.athena.get_query_columns_metadata(
422+
query_execution_id=query_execution_id)
423+
logger.debug(f"cols_metadata: {cols_metadata}")
424+
dtype: Dict[str, str] = {}
425+
parse_timestamps: List[str] = []
426+
parse_dates: List[str] = []
427+
converters: Dict[str, Any] = {}
428+
col_name: str
429+
col_type: str
430+
for col_name, col_type in cols_metadata.items():
431+
pandas_type: str = data_types.athena2pandas(dtype=col_type)
432+
if pandas_type in ["datetime64", "date"]:
433+
parse_timestamps.append(col_name)
434+
if pandas_type == "date":
435+
parse_dates.append(col_name)
436+
elif pandas_type == "literal_eval":
437+
converters[col_name] = ast.literal_eval
438+
elif pandas_type == "bool":
439+
logger.debug(f"Ignoring bool column: {col_name}")
440+
else:
441+
dtype[col_name] = pandas_type
442+
logger.debug(f"dtype: {dtype}")
443+
logger.debug(f"parse_timestamps: {parse_timestamps}")
444+
logger.debug(f"parse_dates: {parse_dates}")
445+
return dtype, parse_timestamps, parse_dates, converters
446+
419447
def read_sql_athena(self, sql, database, s3_output=None, max_result_size=None):
420448
"""
421449
Executes any SQL query on AWS Athena and return a Dataframe of the result.
@@ -436,7 +464,7 @@ def read_sql_athena(self, sql, database, s3_output=None, max_result_size=None):
436464
message_error = f"Query error: {reason}"
437465
raise AthenaQueryError(message_error)
438466
else:
439-
dtype, parse_timestamps, parse_dates, converters = self._session.athena.get_query_dtype(
467+
dtype, parse_timestamps, parse_dates, converters = self._get_query_dtype(
440468
query_execution_id=query_execution_id)
441469
path = f"{s3_output}{query_execution_id}.csv"
442470
ret = self.read_csv(path=path,

testing/test_awswrangler/test_pandas.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -921,3 +921,31 @@ def test_to_parquet_casting_with_null_object(
921921
database=database,
922922
path=f"s3://{bucket}/test/",
923923
mode="overwrite")
924+
925+
926+
def test_read_sql_athena_with_nulls(session, bucket, database):
927+
df = pd.DataFrame({"col_int": [1, None, 3], "col_bool": [True, False, False], "col_bool_null": [True, None, False]})
928+
path = f"s3://{bucket}/test/"
929+
session.pandas.to_parquet(dataframe=df,
930+
database=database,
931+
path=path,
932+
preserve_index=False,
933+
mode="overwrite",
934+
cast_columns={
935+
"col_int": "int",
936+
"col_bool_null": "boolean"
937+
})
938+
df2 = None
939+
for counter in range(10):
940+
df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
941+
assert len(list(df.columns)) == len(list(df2.columns))
942+
if len(df.index) == len(df2.index):
943+
break
944+
sleep(1)
945+
assert len(df.index) == len(df2.index)
946+
print(df2)
947+
print(df2.dtypes)
948+
assert df2.dtypes[0] == "Int64"
949+
assert df2.dtypes[1] == "bool"
950+
assert df2.dtypes[2] == "object"
951+
session.s3.delete_objects(path=path)

0 commit comments

Comments (0)