Skip to content

Commit 75052fc

Browse files
committed
Adding Spark-to-Redshift flow to the README.
1 parent a0c9b83 commit 75052fc

File tree

4 files changed

+44
-5
lines changed

4 files changed

+44
-5
lines changed

README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* CSV (S3) -> Pandas
1919
* Athena -> Pandas
2020
* PySpark -> Redshift
21+
* Delete S3 objects (parallel :rocket:)
2122

2223
## Installation
2324

@@ -55,13 +56,38 @@ dataframe = session.pandas.read_sql_athena(
5556
)
5657
```
5758

59+
### Reading from AWS Athena to Pandas in chunks (for memory-constrained environments)
60+
61+
```py3
62+
session = awswrangler.Session()
63+
dataframe_iter = session.pandas.read_sql_athena(
64+
sql="select * from table",
65+
database="database",
66+
max_result_size=512_000_000 # 512 MB
67+
)
68+
for dataframe in dataframe_iter:
69+
print(dataframe) # Do whatever you want
70+
```
71+
5872
### Reading from S3 (CSV) to Pandas
5973

6074
```py3
6175
session = awswrangler.Session()
6276
dataframe = session.pandas.read_csv(path="s3://...")
6377
```
6478

79+
### Reading from S3 (CSV) to Pandas in chunks (for memory-constrained environments)
80+
81+
```py3
82+
session = awswrangler.Session()
83+
dataframe_iter = session.pandas.read_csv(
84+
path="s3://...",
85+
max_result_size=512_000_000 # 512 MB
86+
)
87+
for dataframe in dataframe_iter:
88+
print(dataframe) # Do whatever you want
89+
```
90+
6591
### Typical Pandas ETL
6692

6793
```py3
@@ -96,8 +122,19 @@ session.spark.to_redshift(
96122
)
97123
```
98124

125+
### Deleting multiple S3 objects in parallel
126+
127+
```py3
128+
session = awswrangler.Session()
129+
session.s3.delete_objects(path="s3://...")
130+
```
131+
99132
## Diving Deep
100133

101134
### Pandas to Redshift Flow
102135

103136
![Pandas to Redshift Flow](docs/pandas-to-redshift/pandas-to-redshift-flow.jpg?raw=true "Pandas to Redshift Flow")
137+
138+
### Spark to Redshift Flow
139+
140+
![Spark to Redshift Flow](docs/spark-to-redshift/spark-to-redshift-flow.jpg?raw=true "Spark to Redshift Flow")

awswrangler/spark.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,23 +54,23 @@ def to_redshift(
5454
:return: None
5555
"""
5656
logger.debug(f"Minimum number of partitions : {min_num_partitions}")
57+
if path[-1] != "/":
58+
path += "/"
5759
self._session.s3.delete_objects(path=path)
58-
num_slices = self._session.redshift.get_number_of_slices(
59-
redshift_conn=connection)
60-
logger.debug(f"Number of slices on Redshift: {num_slices}")
6160
spark = self._session.spark_session
6261
dataframe.cache()
6362
num_rows = dataframe.count()
6463
logger.info(f"Number of rows: {num_rows}")
6564
if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
6665
num_partitions = 1
6766
else:
67+
num_slices = self._session.redshift.get_number_of_slices(
68+
redshift_conn=connection)
69+
logger.debug(f"Number of slices on Redshift: {num_slices}")
6870
num_partitions = num_slices
6971
while num_partitions < min_num_partitions:
7072
num_partitions += num_slices
7173
logger.debug(f"Number of partitions calculated: {num_partitions}")
72-
if path[-1] != "/":
73-
path += "/"
7474
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
7575
session_primitives = self._session.primitives
7676

111 KB
Loading
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<mxfile modified="2019-08-13T19:32:58.390Z" host="www.draw.io" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36" etag="n4OLEefPhGmP99CLax4e" version="11.1.4" type="google"><diagram id="uhJXyVCDerINJ9iByMH9" name="Page-1">7V1dc5u6Fv01mTnnIRmQQODHJE6azG2b3DidtE8ZGWSbFoMLcmOfX38lgfgSdkhjPnp885AYIYTQXkt77yXhnMDL5eZDhFeLT6FL/BOguZsTOD4BYGQg9psXbJMCZNhJwTzy3KRIzwsm3j8kLdTS0rXnkrhUkYahT71VudAJg4A4tFSGoyh8KVebhX75ris8J0rBxMG+WvrkuXSRlNrAystviDdfyDvraJScWWJZOX2SeIHd8KVQBK9O4GUUhjT5tNxcEp+PnRyX5LrrHWezjkUkoE0uCG7u9TXW7+6/fwH38eeb8Yw+n+og7RzdyicmLhuA9DAIA/bnIgrXgUt4Oxo7CiO6COdhgP2PYbhihTor/E4o3abmw2sasqIFXfrpWdbFaPuVX39mysNvaXPiYLwpHW3To5jiiJ5zM+adEWXXnu+nddRxSIcmDteRQ/Y8vJHiCUdzQvcNUtogH5nCHdJh/kDCJWGdZhUi4mPq/SpDB6cInGf1ciOxD6md3mCztNu/sL9O7/SBdR9owXo5JRH7EM740/uew4gDkM8e7WLKytGcf/ICdvKBuPHCm/GrLv11TPllFRiUbf6y8CiZrLAYzxfG9LJ9d9rgF4ko2ewdtPQsAhwb4qJ0qoBmOlW85MTTJZsWBdIhraWRtjohR7sgNxuCHA0K46aC8THxCSW8hwxTW7rwgjlvJIiZf+DPtyCicw5hfXNV2FOyXIURFo8wgWIsGab6Bj0wy5AHuqVAHlg1kIdGSwOPjCN2CKghV+xBcQUpXLlk5qEZKcp+QYREnDjyfJUpDPDRD3ZyjCmeRXjJm/jryeNs0S6xsyB/908bCX9JG1ulDZReu0gbMGrJBjJcLRghG1k5sLJgFgrj5OOHfq5DeeI0FpQ5ZxV0c7XJT8pWPoeyHdbRpKly86y4cMsuuLzx6NfC5285rdlRTmR+0A2Pm/JzW841GtM1bek+9IQlZQSjj6oRjG2XG0lmlvS6Yqwum5IVw9ksJlSBZdb3dyDV7hQU2htAMUynYDd0Crp5aK8gLmXPhbeFCiuOlHg3CvWdKMyBlLR5UFiBzibAb3lW8zszIHMxtAzViLBb4qmowKGSjjCrbV6cmGNWgn1vHrACh4GHeVJ4wV2V52D/PD2x9FyXX3/h4ynxL7DzYy4YdBn6YSTuC2fipxaJe3mqeMVM60i7fFKUE+q85al2ZowM6yBz3CkoQSvTD9qft+yd+FpFZAfCZtgpA+yG+L8It1wRVg2ACDgQk2lEm2bWPXUS8/Iq0Xz6l4jmmS3YA2qVz3/XOPKasKwai7G/eMnjJ7/4dPn5T7efnz9/+XRx9fB8d/38cPc0eX68ex7fTh4fbi++PF7VtpiVXNcVKlTawbDSqFfjwUW4nK7jbmJBdb6Dqm4AjZokStfaEg5gN8rBMB3mqKHDhMOS1UbKBPMUeUJx4AMEtHsc/VwLoW3m+bxgJdjLRoh61At5PoUDl/2ei0p8mshSMKE4/CBbyXbC8qi0obP9HE1K7s8mrJ72xNtyML9VENJkZESbvMT3YnqKZ8xBnb5kHef24vdmyFE65sVJp+Kz/hM6WytR2DBrdJA66c9ui8E67ITB7TJRpsCvUnE0KCbKbheoWHaW95JzdS5T+PGqn/QCb4n5jT+t+dqR4K9oqk4imaTS+eAUcQOptOhWEde7dmwlt5Z7uV4cm95USx8
YnVQx/Z100hUQdM0Nsxr09U8NgLqlhj4kagDQkBrDEEmQrZfQAyCo2L8NiQQoPDx33VikiXlyeF1KTXOZXiM8uKwyUfA3WcpK4svB+Sxdg30T0+yGmFLxzFTOb4Uzf5LiCWBfZH6fndUM7v8S5HskSEmcg0iQKabeqz8qao8EYfsKJDjm2LfxnCC93FAmhd268aAWZv+YOcE64Jygm6gcKxxkjqhGIFkC1MEUAXeiLV4J3U4FlwjKTuMkKuPwCsJIaBVVgI1D7na0HTuZqpHhAvPK+C3RJbwuQDXp7xAE/lF1yrdq9gXadfq+3pq+39Ei+iDDQ9hUVYRwUK4AqrJisnE2YYCY31PNXPCK51ZVVmFxG66vU8IrcgW/f6UQgipFzKZ7Z2FbO6IMVSQ6Ioo0jZbgsFIoqPovuQiWkGSJA29GYnr2PRZEydnykuwQrBIknH5nMYFwRWJxjDXjRfJ5+EZdwbv+KQRso0IhJMOG4hqU3iWFTE0ZliOiUFOx3RiW2g5Vtf1YKIQqBLKQSiCzhkBGW2Ga0ZHyN8iM3WgqyRvGoAhkqPvZW8rY7xi2060S//7E3TikmGcYUmx9Z64OTCVZB+U22kvWzc6kofPVqrB6M5h9+4P0+42nLbOp0FjQoEeaVUab+c6t/+2jVI5H8fWeBSnsLft0N75SfX+HAg0E5c1bll2z4le3/9JoLfc8ZnlGuvPX/b41LL+vvtc8fri7ZyWP5xcfGcS122v26+rr7eRx0nuwi7QK6EeZJyvCHtW9hNbWu5tmxwtUg9qBItF8OL/REezlgkrx9c2AAzFLBeUbmS6mXM+n21WyEMD39Bbe5K+cTpJFJyK4kHwmQV6adObX9kwmE1XXbGzTqKETqPMisK3s0TxmBVP65te9yMDopG4CmfBBkYthNMJBjJ10w/6waFAVUCzYUEDR2wqk0DFTQDqK1z3KsBRI2e8CBT6G2M18QLpmNTjwVwV4W+4I7A/+4Jjhf3BU10tALCkvmX1kVuLjHS/NH0wGUj1GcVn455qIp7wd904RCMvqRaZmFDMODbVDELy5uTTcDUSrr1c/R/95+Oe/6Px0Pz9yKlzlpf3R5fepgGqWc2vHo7c3Gmt7s/+FhNat87bssAPrDGudA3WxV6w0hfksChDrhfW7WAa6/asSFGfu4bWoAJltGe6Y1UXUdFkeDUtdROqyfPKNcnHGh1JcXHhvu7jhchjfGoeqr9hYo74jZVXFamkt7YHQdSS2TKwdh8TxbO3zd+B/f2ntTdZjGPELi7CuSWzXEASLwh+kcMYGU4jQoeI+ULa3rqv21uV2pdIKS2t06uxdm8cFn8nyRJWnsldsdov43LJxyCqRcvqx/8x2iOPU2X9qm4a5d6p9w1qDXs6QGN+Bav9aH9gW361uhKF2fZn8Ys2uM1zLKrfQcoIrH7N9rl4KR/qvnopRlYoKEVHdGoW1GzPvSvH2f0vJn5iA78urXw86e9tOva/bpaW95dIT4lJxHaJvXQlY6kvbNREGqIkwWpOW9m/dbAnZv4/Qg2sL9a4E2VbFUFUXfzhnsu85C5B+CH1ffqfUgECNQHU9YWTVfUXIYTDNDvP/5JCMdf7vMODV/wA=</diagram></mxfile>

0 commit comments

Comments
 (0)