
Commit 67d40c1

fix: replace amazon-reviews with ursa-labs public S3 bucket (#2460)
1 parent: e19e987

File tree: 3 files changed (+16, -30 lines)

tests/glue_scripts/wrangler_blog_simple.py

Lines changed: 12 additions & 16 deletions

@@ -7,37 +7,33 @@
 glue_database = os.environ["glue-database"]
 glue_table = os.environ["glue-table"]

-category = "toys"
+# Read 1.5 Gb Parquet data
+df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/")

-# Read Parquet data (1.2 Gb parquet compressed)
-df = wr.s3.read_parquet(
-    path=f"s3://amazon-reviews-pds/parquet/product_category={category.title()}/",
-)
-
-# Drop customer_id column
-df.drop("customer_id", axis=1, inplace=True)
+# Drop vendor_id column
+df.drop("vendor_id", axis=1, inplace=True)

-# Filter reviews with 5-star rating
-df5 = df[df["star_rating"] == 5]
+# Filter trips with 1 passenger
+df1 = df[df["trip_distance"] > 1]

-# Write partitioned five stars reviews to S3 in Parquet format
+# Write partitioned trips to S3 in Parquet format
 wr.s3.to_parquet(
-    df5,
-    path=f"{output_path}output/{category}/",
-    partition_cols=["year", "marketplace"],
+    df1,
+    path=f"{output_path}output/{glue_table}/",
+    partition_cols=["passenger_count", "payment_type"],
     dataset=True,
     database=glue_database,
     table=glue_table,
 )

 # Read the data back to a modin df via Athena
-df5_athena = wr.athena.read_sql_query(
+df1_athena = wr.athena.read_sql_query(
     f"SELECT * FROM {glue_table}",
     database=glue_database,
     ctas_approach=False,
     unload_approach=True,
     workgroup=workgroup_name,
-    s3_output=f"{output_path}unload/{category}/",
+    s3_output=f"{output_path}unload/{glue_table}/",
 )

 # Delete table (required due to LF)
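The updated Glue script assumes the ursa-labs-taxi-data bucket exposes the vendor_id, trip_distance, passenger_count and payment_type columns it drops, filters and partitions on. A minimal sketch for checking that locally with pyarrow, without AWS credentials, might look like the one below; the per-month key layout (2017/01/data.parquet), the anonymous-access setup and the us-east-2 region are assumptions, not something this commit confirms.

# Sketch: peek at the schema of one month of the new public dataset with pyarrow,
# using anonymous S3 access (bucket layout, region and key name are assumptions).
import pyarrow.dataset as ds
from pyarrow import fs

s3 = fs.S3FileSystem(anonymous=True, region="us-east-2")
taxi = ds.dataset("ursa-labs-taxi-data/2017/01/data.parquet", filesystem=s3, format="parquet")

print(taxi.schema)               # expect vendor_id, passenger_count, trip_distance, payment_type, ...
print(taxi.head(5).to_pandas())  # small sample; avoids pulling the full ~1.5 Gb of 2017 data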

tests/load/test_databases.py

Lines changed: 4 additions & 4 deletions

@@ -109,15 +109,15 @@ def test_redshift_copy_unload(

 @pytest.mark.parametrize("benchmark_time", [40])
 def test_athena_unload(benchmark_time: int, path: str, glue_table: str, glue_database: str, request) -> None:
-    df = wr.s3.read_parquet(path="s3://amazon-reviews-pds/parquet/product_category=Toys/", dataset=True)
+    df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/", dataset=True)

     wr.s3.to_parquet(
         df,
         path,
         dataset=True,
         table=glue_table,
         database=glue_database,
-        partition_cols=["year", "marketplace"],
+        partition_cols=["passenger_count", "payment_type"],
     )

     with ExecutionTimer(request) as timer:

@@ -136,7 +136,7 @@ def test_athena_unload(benchmark_time: int, path: str, glue_table: str, glue_dat

 @pytest.mark.parametrize("benchmark_time", [80])
 def test_lakeformation_read(benchmark_time: int, path: str, glue_table: str, glue_database: str, request) -> None:
-    df = wr.s3.read_parquet(path="s3://amazon-reviews-pds/parquet/product_category=Home/", dataset=True)
+    df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/", dataset=True)

     wr.s3.to_parquet(
         df,

@@ -145,7 +145,7 @@ def test_lakeformation_read(benchmark_time: int, path: str, glue_table: str, glu
         dataset=True,
         table=glue_table,
         database=glue_database,
-        partition_cols=["year", "marketplace"],
+        partition_cols=["passenger_count", "payment_type"],
         glue_table_settings={
             "table_type": "GOVERNED",
         },
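Both load tests now partition on passenger_count and payment_type rather than the old year / marketplace columns. A hedged pre-flight check that those columns actually exist in the taxi dataset could look like the sketch below; it assumes AWS credentials with read access to the public bucket, and wr.s3.read_parquet_metadata only inspects Parquet footers rather than reading the data.

# Sketch: confirm the new partition columns exist before running the load tests.
import awswrangler as wr

columns, _partitions = wr.s3.read_parquet_metadata(path="s3://ursa-labs-taxi-data/2017/")
missing = {"passenger_count", "payment_type"} - columns.keys()
assert not missing, f"taxi dataset is missing expected columns: {missing}"
print({col: columns[col] for col in ("passenger_count", "payment_type")})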

tests/load/test_s3.py

Lines changed: 0 additions & 10 deletions

@@ -96,16 +96,6 @@ def test_s3_read_parquet_many_files(
     assert timer.elapsed_time < benchmark_time


-@pytest.mark.parametrize("benchmark_time", [40])
-def test_s3_read_parquet_partition_filter(benchmark_time: float, request: pytest.FixtureRequest) -> None:
-    path = "s3://amazon-reviews-pds/parquet/"
-    with ExecutionTimer(request, data_paths=path) as timer:
-        filter = lambda x: True if x["product_category"].startswith("Wireless") else False  # noqa: E731
-        wr.s3.read_parquet(path=path, dataset=True, partition_filter=filter)
-
-    assert timer.elapsed_time < benchmark_time
-
-
 @pytest.mark.parametrize("benchmark_time", [5])
 @pytest.mark.parametrize("path_suffix", [None, "df.parquet"])
 def test_s3_write_parquet_simple(
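The deleted benchmark relied on Hive-style partitions (product_category=...) that the flat ursa-labs-taxi-data layout does not provide, which is presumably why it was dropped rather than rewritten. If an equivalent benchmark were wanted, it would have to target a dataset that is hive-partitioned, for example output written by the partition_cols writes above. A hypothetical sketch, not part of this commit, with the path fixture and filter value purely illustrative:

# Hypothetical sketch only: partition_filter needs a hive-partitioned dataset,
# e.g. one produced by wr.s3.to_parquet(..., partition_cols=["passenger_count", "payment_type"]).
@pytest.mark.parametrize("benchmark_time", [40])
def test_s3_read_parquet_partition_filter(benchmark_time: float, path: str, request: pytest.FixtureRequest) -> None:
    # `path` is assumed to point at a dataset previously written with wr.s3.to_parquet(..., dataset=True)
    with ExecutionTimer(request, data_paths=path) as timer:
        keep = lambda part: part["payment_type"] == "1"  # partition values arrive as strings  # noqa: E731
        wr.s3.read_parquet(path=path, dataset=True, partition_filter=keep)

    assert timer.elapsed_time < benchmark_time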
