Merge pull request #77 from awslabs/pandas-read-athena-array

igorborgest · web-flow · commit 26bcddf33389 · 2019-12-04T15:19:29.000-03:00
Fix Pandas.read_sql_athena() for arrays
diff --git a/awswrangler/athena.py b/awswrangler/athena.py
@@ -25,6 +25,7 @@ def get_query_columns_metadata(self, query_execution_id: str) -> Dict[str, str]:
         """
         response: Dict = self._client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
         col_info: List[Dict[str, str]] = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
+        logger.debug(f"col_info: {col_info}")
         return {x["Name"]: x["Type"] for x in col_info}
 
     def create_athena_bucket(self):
diff --git a/awswrangler/data_types.py b/awswrangler/data_types.py
@@ -25,7 +25,7 @@ def athena2pandas(dtype: str) -> str:
     elif dtype == "date":
         return "date"
     elif dtype == "array":
-        return "literal_eval"
+        return "list"
     else:
         raise UnsupportedType(f"Unsupported Athena type: {dtype}")
 
diff --git a/awswrangler/pandas.py b/awswrangler/pandas.py
@@ -418,6 +418,39 @@ def _read_csv_once(
         buff.close()
         return dataframe
 
+    @staticmethod
+    def _list_parser(value: str) -> List[Union[int, float, str, None]]:
+        """Parse the text form of an array cell (e.g. "[1, null, 3]") into a list.
+
+        The first and last characters (the brackets) are dropped and the rest is
+        split on ", ". Items equal to "null" become None. The other items are all
+        cast to int if every one parses as int, else to float if every one parses
+        as float, else they are kept as str.
+
+        :param value: Raw cell text (this function is registered as a column converter)
+        :return: Parsed items; an empty list for "[]" or shorter input
+        """
+        body: str = value[1:-1]
+        if not body:
+            # "[]" must yield [] instead of [""], which "".split(", ") would give
+            return []
+
+        items: List[Optional[str]] = [None if x == "null" else x for x in body.split(", ")]
+        present: List[str] = [x for x in items if x is not None]
+
+        # Try the narrowest numeric type first; keep str when neither fits
+        array_type: type = str
+        for candidate in (int, float):
+            try:
+                for item in present:
+                    candidate(item)
+            except ValueError:
+                continue
+            array_type = candidate
+            break
+
+        return [None if x is None else array_type(x) for x in items]
+
     def _get_query_dtype(self, query_execution_id: str) -> Tuple[Dict[str, str], List[str], List[str], Dict[str, Any]]:
         cols_metadata: Dict[str, str] = self._session.athena.get_query_columns_metadata(
             query_execution_id=query_execution_id)
@@ -434,15 +467,16 @@ def _get_query_dtype(self, query_execution_id: str) -> Tuple[Dict[str, str], Lis
                 parse_timestamps.append(col_name)
                 if pandas_type == "date":
                     parse_dates.append(col_name)
-            elif pandas_type == "literal_eval":
-                converters[col_name] = ast.literal_eval
+            elif pandas_type == "list":
+                converters[col_name] = Pandas._list_parser
             elif pandas_type == "bool":
                 logger.debug(f"Ignoring bool column: {col_name}")
             else:
                 dtype[col_name] = pandas_type
         logger.debug(f"dtype: {dtype}")
         logger.debug(f"parse_timestamps: {parse_timestamps}")
         logger.debug(f"parse_dates: {parse_dates}")
+        logger.debug(f"converters: {converters}")
         return dtype, parse_timestamps, parse_dates, converters
 
     def read_sql_athena(self, sql, database=None, s3_output=None, max_result_size=None, workgroup=None,
diff --git a/testing/test_awswrangler/test_pandas.py b/testing/test_awswrangler/test_pandas.py
@@ -1285,9 +1285,10 @@ def test_to_parquet_date_null_at_first(session, bucket, database):
 def test_to_parquet_array(session, bucket, database):
     df = pd.DataFrame({
         "A": [1, 2, 3],
-        "B": [[], [4, 5, 6], []],
-        "C": [[], ["foo", "boo", "bar"], []],
-        "D": [7, 8, 9]
+        "B": [[], [4.0, None, 6.0], []],
+        "C": [[], [7, None, 9], []],
+        "D": [[], ["foo", None, "bar"], []],
+        "E": [10, 11, 12]
     })
     path = f"s3://{bucket}/test/"
     session.pandas.to_parquet(dataframe=df,
@@ -1296,3 +1297,18 @@ def test_to_parquet_array(session, bucket, database):
                               mode="overwrite",
                               preserve_index=False,
                               procs_cpu_bound=1)
+    df2 = None
+    for counter in range(10):  # Retrying to workaround s3 eventual consistency
+        sleep(1)
+        df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
+        if len(df.index) == len(df2.index):
+            break
+    print(df2)
+    session.s3.delete_objects(path=path)
+
+    assert len(list(df.columns)) == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
+
+    assert df2[df2.a == 2].iloc[0].b[0] == 4.0
+    assert df2[df2.a == 2].iloc[0].c[0] == 7
+    assert df2[df2.a == 2].iloc[0].d[0] == "foo"