
Commit 34dbdd7

Bumping version to 0.2.5
1 parent: eb06a51

7 files changed (+130, -73 lines)

README.md

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 ![AWS Data Wrangler](docs/source/_static/logo.png?raw=true "AWS Data Wrangler")

-> Utility belt to handle data on AWS.
+> DataFrames on AWS.

-[![Release](https://img.shields.io/badge/release-0.2.2-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-0.2.5-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)

awswrangler/__version__.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
-__description__ = "Utility belt to handle data on AWS."
-__version__ = "0.2.2"
+__description__ = "DataFrames on AWS."
+__version__ = "0.2.5"
 __license__ = "Apache License 2.0"
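A quick runtime check of the bumped metadata (a minimal sketch, not part of this commit; it assumes the package re-exports these attributes from awswrangler/__version__.py at the top level):

import awswrangler

# Expected output once 0.2.5 is installed:
# 0.2.5
# DataFrames on AWS.
print(awswrangler.__version__)
print(awswrangler.__description__)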

awswrangler/pandas.py

Lines changed: 18 additions & 6 deletions
@@ -1247,7 +1247,6 @@ def to_redshift(
             generated_conn = True

         try:
-
             if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
                 num_partitions: int = 1
             else:
@@ -1558,7 +1557,7 @@ def read_sql_redshift(self,

         :param sql: SQL Query
         :param iam_role: AWS IAM role with the related permissions
-        :param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
+        :param connection: Glue connection name (str) OR a PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
         :param temp_s3_path: AWS S3 path to write temporary data (e.g. s3://...) (Default uses the Athena's results bucket)
         :param procs_cpu_bound: Number of cores used for CPU bound tasks
         """
@@ -1574,21 +1573,34 @@ def read_sql_redshift(self,
         logger.debug(f"temp_s3_path: {temp_s3_path}")
         self._session.s3.delete_objects(path=temp_s3_path)
         paths: Optional[List[str]] = None
+
+        generated_conn: bool = False
+        if type(connection) == str:
+            logger.debug("Glue connection (str) provided.")
+            connection = self._session.glue.get_connection(name=connection)
+            generated_conn = True
+
         try:
             paths = self._session.redshift.to_parquet(sql=sql,
                                                       path=temp_s3_path,
                                                       iam_role=iam_role,
                                                       connection=connection)
             logger.debug(f"paths: {paths}")
             df: pd.DataFrame = self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound)  # type: ignore
-            self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])  # type: ignore
-            return df
-        except Exception as e:
+        except Exception as ex:
+            connection.rollback()
             if paths is not None:
                 self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])
             else:
                 self._session.s3.delete_objects(path=temp_s3_path)
-            raise e
+            if generated_conn is True:
+                connection.close()
+            raise ex
+
+        if generated_conn is True:
+            connection.close()
+        self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])  # type: ignore
+        return df

     def to_aurora(self,
                   dataframe: pd.DataFrame,
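For context, a minimal usage sketch of the widened read_sql_redshift signature (not part of the commit; the Glue connection name, bucket, table, and IAM role ARN below are placeholders). Passing a string makes the session resolve the Glue connection itself, roll it back on failure, and close it afterwards; passing an already-open PEP 249 connection leaves the lifecycle to the caller.

import awswrangler

session = awswrangler.Session()

# Option A: Glue connection name (str) - resolved, used, and closed by awswrangler.
df = session.pandas.read_sql_redshift(sql="SELECT * FROM public.my_table",
                                      iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
                                      connection="my-glue-redshift-connection",
                                      temp_s3_path="s3://my-bucket/temp/")

# Option B: PEP 249 connection object - the caller keeps ownership and must close it.
con = session.glue.get_connection(name="my-glue-redshift-connection")
df = session.pandas.read_sql_redshift(sql="SELECT * FROM public.my_table",
                                      iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
                                      connection=con,
                                      temp_s3_path="s3://my-bucket/temp/")
con.close()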

awswrangler/spark.py

Lines changed: 74 additions & 59 deletions
@@ -71,7 +71,7 @@ def to_redshift(

         :param dataframe: Pandas Dataframe
         :param path: S3 path to write temporary files (E.g. s3://BUCKET_NAME/ANY_NAME/)
-        :param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
+        :param connection: Glue connection name (str) OR a PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
         :param schema: The Redshift Schema for the table
         :param table: The name of the desired Redshift table
         :param iam_role: AWS IAM role with the related permissions
@@ -93,68 +93,83 @@ def to_redshift(
         dataframe.cache()
         num_rows: int = dataframe.count()
         logger.info(f"Number of rows: {num_rows}")
-        num_partitions: int
-        if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
-            num_partitions = 1
-        else:
-            num_slices: int = self._session.redshift.get_number_of_slices(redshift_conn=connection)
-            logger.debug(f"Number of slices on Redshift: {num_slices}")
-            num_partitions = num_slices
-            while num_partitions < min_num_partitions:
-                num_partitions += num_slices
-        logger.debug(f"Number of partitions calculated: {num_partitions}")
-        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
-        session_primitives = self._session.primitives
-        par_col_name: str = "aws_data_wrangler_internal_partition_id"

-        @pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
-        def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
-            # Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
-            # a temporary workaround while waiting for Apache Arrow updates
-            # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
-            os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
+        generated_conn: bool = False
+        if type(connection) == str:
+            logger.debug("Glue connection (str) provided.")
+            connection = self._session.glue.get_connection(name=connection)
+            generated_conn = True

-            del pandas_dataframe[par_col_name]
-            paths: List[str] = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
-                                                                            path=path,
-                                                                            preserve_index=False,
-                                                                            mode="append",
-                                                                            procs_cpu_bound=1,
-                                                                            procs_io_bound=1,
-                                                                            cast_columns=casts)
-            return pd.DataFrame.from_dict({"objects_paths": paths})
+        try:
+            num_partitions: int
+            if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
+                num_partitions = 1
+            else:
+                num_slices: int = self._session.redshift.get_number_of_slices(redshift_conn=connection)
+                logger.debug(f"Number of slices on Redshift: {num_slices}")
+                num_partitions = num_slices
+                while num_partitions < min_num_partitions:
+                    num_partitions += num_slices
+            logger.debug(f"Number of partitions calculated: {num_partitions}")
+            spark.conf.set("spark.sql.execution.arrow.enabled", "true")
+            session_primitives = self._session.primitives
+            par_col_name: str = "aws_data_wrangler_internal_partition_id"

-        df_objects_paths: DataFrame = dataframe.repartition(numPartitions=num_partitions)  # type: ignore
-        df_objects_paths: DataFrame = df_objects_paths.withColumn(par_col_name, spark_partition_id())  # type: ignore
-        df_objects_paths: DataFrame = df_objects_paths.groupby(par_col_name).apply(write)  # type: ignore
+            @pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
+            def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
+                # Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
+                # a temporary workaround while waiting for Apache Arrow updates
+                # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
+                os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

-        objects_paths: List[str] = list(df_objects_paths.toPandas()["objects_paths"])
-        dataframe.unpersist()
-        num_files_returned: int = len(objects_paths)
-        if num_files_returned != num_partitions:
-            raise MissingBatchDetected(f"{num_files_returned} files returned. {num_partitions} expected.")
-        logger.debug(f"List of objects returned: {objects_paths}")
-        logger.debug(f"Number of objects returned from UDF: {num_files_returned}")
-        manifest_path: str = f"{path}manifest.json"
-        self._session.redshift.write_load_manifest(manifest_path=manifest_path,
-                                                   objects_paths=objects_paths,
-                                                   procs_io_bound=self._procs_io_bound)
-        self._session.redshift.load_table(dataframe=dataframe,
-                                          dataframe_type="spark",
-                                          manifest_path=manifest_path,
-                                          schema_name=schema,
-                                          table_name=table,
-                                          redshift_conn=connection,
-                                          preserve_index=False,
-                                          num_files=num_partitions,
-                                          iam_role=iam_role,
-                                          diststyle=diststyle,
-                                          distkey=distkey,
-                                          sortstyle=sortstyle,
-                                          sortkey=sortkey,
-                                          mode=mode,
-                                          cast_columns=casts)
-        self._session.s3.delete_objects(path=path, procs_io_bound=self._procs_io_bound)
+                del pandas_dataframe[par_col_name]
+                paths: List[str] = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
+                                                                                path=path,
+                                                                                preserve_index=False,
+                                                                                mode="append",
+                                                                                procs_cpu_bound=1,
+                                                                                procs_io_bound=1,
+                                                                                cast_columns=casts)
+                return pd.DataFrame.from_dict({"objects_paths": paths})
+
+            df_objects_paths: DataFrame = dataframe.repartition(numPartitions=num_partitions)  # type: ignore
+            df_objects_paths = df_objects_paths.withColumn(par_col_name, spark_partition_id())  # type: ignore
+            df_objects_paths = df_objects_paths.groupby(par_col_name).apply(write)  # type: ignore
+
+            objects_paths: List[str] = list(df_objects_paths.toPandas()["objects_paths"])
+            dataframe.unpersist()
+            num_files_returned: int = len(objects_paths)
+            if num_files_returned != num_partitions:
+                raise MissingBatchDetected(f"{num_files_returned} files returned. {num_partitions} expected.")
+            logger.debug(f"List of objects returned: {objects_paths}")
+            logger.debug(f"Number of objects returned from UDF: {num_files_returned}")
+            manifest_path: str = f"{path}manifest.json"
+            self._session.redshift.write_load_manifest(manifest_path=manifest_path,
+                                                       objects_paths=objects_paths,
+                                                       procs_io_bound=self._procs_io_bound)
+            self._session.redshift.load_table(dataframe=dataframe,
+                                              dataframe_type="spark",
+                                              manifest_path=manifest_path,
+                                              schema_name=schema,
+                                              table_name=table,
+                                              redshift_conn=connection,
+                                              preserve_index=False,
+                                              num_files=num_partitions,
+                                              iam_role=iam_role,
+                                              diststyle=diststyle,
+                                              distkey=distkey,
+                                              sortstyle=sortstyle,
+                                              sortkey=sortkey,
+                                              mode=mode,
+                                              cast_columns=casts)
+            self._session.s3.delete_objects(path=path, procs_io_bound=self._procs_io_bound)
+        except Exception as ex:
+            connection.rollback()
+            if generated_conn is True:
+                connection.close()
+            raise ex
+        if generated_conn is True:
+            connection.close()

     def create_glue_table(self,
                           database,
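The Spark writer gets the same connection handling, so a failed load is rolled back and, when the connection was generated from a Glue connection name, closed. A minimal usage sketch (not part of the commit; placeholders throughout, and it assumes an existing SparkSession named spark, an existing Spark DataFrame named spark_df, and that Session accepts a spark_session argument):

import awswrangler

session = awswrangler.Session(spark_session=spark)  # `spark` is an existing SparkSession (assumption)

session.spark.to_redshift(dataframe=spark_df,  # `spark_df` is an existing Spark DataFrame (assumption)
                          path="s3://my-bucket/redshift-load/",
                          connection="my-glue-redshift-connection",  # or a PEP 249 connection object
                          schema="public",
                          table="my_table",
                          iam_role="arn:aws:iam::111111111111:role/MyRedshiftRole",
                          mode="overwrite")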

docs/source/index.rst

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     :alt: alternate text
     :figclass: align-center

-*Utility belt to handle data on AWS.*
+*DataFrames on AWS.*

 `Read the Tutorials <https://github.com/awslabs/aws-data-wrangler/tree/master/tutorials>`_: `Catalog & Metadata <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/catalog_and_metadata.ipynb>`_ | `Athena Nested <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/athena_nested.ipynb>`_ | `S3 Write Modes <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/s3_write_modes.ipynb>`_

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 numpy~=1.18.1
 pandas~=0.25.3
 pyarrow~=0.15.1
-botocore~=1.14.1
-boto3~=1.11.1
+botocore~=1.14.2
+boto3~=1.11.2
 s3fs~=0.4.0
 tenacity~=6.0.0
 pg8000~=1.13.2

testing/test_awswrangler/test_redshift.py

Lines changed: 31 additions & 1 deletion
@@ -347,7 +347,7 @@ def test_to_redshift_spark_bool(session, bucket, redshift_parameters):
     session.spark.to_redshift(
         dataframe=dataframe,
         path=f"s3://{bucket}/redshift-load-bool/",
-        connection=con,
+        connection="aws-data-wrangler-redshift",
         schema="public",
         table="test",
         iam_role=redshift_parameters.get("RedshiftRole"),
@@ -722,3 +722,33 @@ def test_to_redshift_pandas_upsert(session, bucket, redshift_parameters):

     wr.s3.delete_objects(path=f"s3://{bucket}/")
     con.close()
+
+
+@pytest.mark.parametrize("sample_name", ["micro", "small", "nano"])
+def test_read_sql_redshift_pandas_glue_conn(session, bucket, redshift_parameters, sample_name):
+    if sample_name == "micro":
+        dates = ["date"]
+    elif sample_name == "small":
+        dates = ["date"]
+    else:
+        dates = ["date", "time"]
+    df = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
+    df["date"] = df["date"].dt.date
+    path = f"s3://{bucket}/test_read_sql_redshift_pandas_glue_conn/"
+    session.pandas.to_redshift(
+        dataframe=df,
+        path=path,
+        schema="public",
+        table="test",
+        connection="aws-data-wrangler-redshift",
+        iam_role=redshift_parameters.get("RedshiftRole"),
+        mode="overwrite",
+        preserve_index=True,
+    )
+    path2 = f"s3://{bucket}/test_read_sql_redshift_pandas_glue_conn2/"
+    df2 = session.pandas.read_sql_redshift(sql="select * from public.test",
+                                           iam_role=redshift_parameters.get("RedshiftRole"),
+                                           connection="aws-data-wrangler-redshift",
+                                           temp_s3_path=path2)
+    assert len(df.index) == len(df2.index)
+    assert len(df.columns) + 1 == len(df2.columns)
