> Utility belt to handle data on AWS.

- [![Release](https://img.shields.io/badge/release-0.1.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+ [![Release](https://img.shields.io/badge/release-0.1.1-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
* Pandas -> CSV (S3) (Parallel)
* Pandas -> Glue Catalog Table
* Pandas -> Athena (Parallel)
- * Pandas -> Redshift (Parallel)
+ * Pandas -> Redshift (Append/Overwrite/Upsert) (Parallel)
* Parquet (S3) -> Pandas (Parallel) (NEW :star:)
* CSV (S3) -> Pandas (One shot or Batching)
* Glue Catalog Table -> Pandas (Parallel) (NEW :star:)
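
The Append/Overwrite/Upsert behaviour maps to the `mode` argument of `wr.pandas.to_redshift`. A minimal sketch, assuming `mode` accepts `"append"`, `"overwrite"` or `"upsert"` and that `primary_keys` is only needed for upsert (`con` and the S3 staging path are placeholders):

```py3
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "value": ["foo", "boo"]})

# `con` stands in for an already-opened Redshift connection (placeholder).
wr.pandas.to_redshift(
    dataframe=df,
    path="s3://temp_path",       # S3 staging prefix used for the bulk load
    schema="public",
    table="my_table",
    connection=con,
    iam_role="YOUR_ROLE_ARN",
    mode="upsert",               # assumed values: "append" / "overwrite" / "upsert"
    primary_keys=["id"],         # assumed to be required only for upsert
)
```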
* Get EMR step state
* Athena query to receive the result as python primitives (*Iterable[Dict[str, Any]]*)
+ * Load and Unzip SageMaker jobs outputs
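
A one-call sketch for checking an EMR step, assuming a module-level `wr.emr.get_step_state(cluster_id, step_id)` helper that returns the state as a string (the ids are placeholders):

```py3
import awswrangler as wr

# Assumed helper: returns a state string such as "PENDING", "RUNNING" or "COMPLETED".
state = wr.emr.get_step_state(cluster_id="j-XXXXXXXXXXXXX", step_id="s-XXXXXXXXXXXXX")
print(state)
```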

## Installation

@@ -84,7 +85,7 @@ Runs anywhere (AWS Lambda, AWS Glue Python Shell, EMR, EC2, on-premises, local,
import awswrangler as wr

wr.pandas.to_parquet(
-     dataframe=dataframe,
+     dataframe=df,
    database="database",
    path="s3://...",
    partition_cols=["col_name"],
@@ -113,7 +114,7 @@ sess.pandas.to_parquet(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_sql_athena(
+ df = wr.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
@@ -140,7 +141,7 @@ for df in df_iter:
import awswrangler as wr

sess = wr.Session(athena_ctas_approach=True)
- dataframe = sess.pandas.read_sql_athena(
+ df = sess.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
@@ -151,7 +152,7 @@ dataframe = sess.pandas.read_sql_athena(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_csv(path="s3://...")
+ df = wr.pandas.read_csv(path="s3://...")
```

#### Reading from S3 (CSV) to Pandas in chunks (For memory restrictions)
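
A minimal sketch of the chunked read, assuming `max_result_size` is the argument that switches `read_csv` into batching mode:

```py3
import awswrangler as wr

# Assumption: passing max_result_size makes read_csv yield DataFrame chunks
# instead of returning a single DataFrame.
df_iter = wr.pandas.read_csv(
    path="s3://...",
    max_result_size=512_000_000,  # ~512 MB per chunk
)

for df in df_iter:
    print(len(df.index))  # process each chunk here
```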
@@ -173,7 +174,7 @@ for df in df_iter:
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_log_query(
+ df = wr.pandas.read_log_query(
    log_group_names=[LOG_GROUP_NAME],
    query="fields @timestamp, @message | sort @timestamp desc | limit 5",
)
@@ -190,7 +191,7 @@ df = pandas.read_... # Read from anywhere
# Typical Pandas, Numpy or Pyarrow transformation HERE!

wr.pandas.to_parquet(  # Storing the data and metadata to Data Lake
-     dataframe=dataframe,
+     dataframe=df,
    database="database",
    path="s3://...",
    partition_cols=["col_name"],
@@ -203,7 +204,7 @@ wr.pandas.to_parquet( # Storing the data and metadata to Data Lake
import awswrangler as wr

wr.pandas.to_redshift(
-     dataframe=dataframe,
+     dataframe=df,
    path="s3://temp_path",
    schema="...",
    table="...",
@@ -219,7 +220,7 @@ wr.pandas.to_redshift(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_sql_redshift(
+ df = wr.pandas.read_sql_redshift(
    sql="SELECT ...",
    iam_role="YOUR_ROLE_ARN",
    connection=con,
@@ -268,6 +269,7 @@ sess.spark.create_glue_table(
```py3
import awswrangler as wr
+
sess = wr.Session(spark_session=spark)
dfs = sess.spark.flatten(dataframe=df_nested)
for name, df_flat in dfs.items():
@@ -367,6 +369,14 @@ for row in wr.athena.query(query="...", database="..."):
    print(row)
```

+ #### Load and unzip SageMaker job output
+
+ ```py3
+ import awswrangler as wr
+
+ outputs = wr.sagemaker.get_job_outputs("s3://...")
+ ```
+
## Diving Deep

### Parallelism, Non-picklable objects and GeoPandas
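
A minimal sketch of the usual workaround, assuming the Session parameter `procs_cpu_bound` controls the number of forked workers (setting it to 1 disables multiprocessing, so non-picklable objects such as GeoPandas geometries never have to cross process boundaries):

```py3
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"col": [1, 2]})

# Assumption: procs_cpu_bound=1 keeps everything in a single process,
# so nothing has to be pickled between workers.
sess = wr.Session(procs_cpu_bound=1)
sess.pandas.to_parquet(dataframe=df, path="s3://...")
```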
@@ -397,14 +407,14 @@ To work with null object columns you can explicitly set the expected Athena data
import awswrangler as wr
import pandas as pd

- dataframe = pd.DataFrame({
+ df = pd.DataFrame({
    "col": [1, 2],
    "col_string_null": [None, None],
    "col_date_null": [None, None],
})

wr.pandas.to_parquet(
-     dataframe=dataframe,
+     dataframe=df,
    database="DATABASE",
    path=f"s3://...",
    cast_columns={