Skip to content

Commit 434ba32

Browse files
committed
Add pandas versus polars benchmark
1 parent ed5070f commit 434ba32

File tree

2 files changed

+189
-0
lines changed

2 files changed

+189
-0
lines changed

source-code/polars/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,6 @@ Polars is an alternative to pandas that is designed to have better performance.
88
1. `patient_data.ipynb`: Jupyter notebook that explores functional differences
99
between pandas and polars. It replicates the notebook in the `pandas`
1010
directory with the same name.
11+
1. `polars_versus_pandas_benchmarks.ipynb`: Jupyter notebook that compares the
12+
performance of polars and pandas on CSV loading, per-day mean aggregation, and row-wise sums.
1113
1. `data`: Directory containing the data used in the notebook.
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "c9f1da30-f441-4aaa-b91c-c1b6cf3b934d",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import datetime\n",
11+
"import pandas as pd\n",
12+
"import polars as pl"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 2,
18+
"id": "41066023-b80e-46be-bd3d-538edc93f88e",
19+
"metadata": {},
20+
"outputs": [
21+
{
22+
"name": "stdout",
23+
"output_type": "stream",
24+
"text": [
25+
"CPU times: user 11.3 s, sys: 1.22 s, total: 12.5 s\n",
26+
"Wall time: 12.5 s\n"
27+
]
28+
}
29+
],
30+
"source": [
31+
"%time df_pandas = pd.read_csv('large_data_0001.csv', parse_dates=['timestamp',])"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": 3,
37+
"id": "0dae0538-b515-43eb-a6e7-3c912fdec520",
38+
"metadata": {},
39+
"outputs": [
40+
{
41+
"name": "stdout",
42+
"output_type": "stream",
43+
"text": [
44+
"CPU times: user 9.54 s, sys: 1.81 s, total: 11.4 s\n",
45+
"Wall time: 1.08 s\n"
46+
]
47+
}
48+
],
49+
"source": [
50+
"%time df_polars = pl.read_csv('large_data_0001.csv', try_parse_dates=True)"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 8,
56+
"id": "c2e3e5b5-5433-439a-be0e-892444a08200",
57+
"metadata": {
58+
"scrolled": true
59+
},
60+
"outputs": [
61+
{
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"689 ms ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
66+
]
67+
}
68+
],
69+
"source": [
70+
"%timeit days_pandas = df_pandas.groupby(df_pandas.timestamp.dt.day).mean()"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 9,
76+
"id": "85de86a0-3aac-41f0-9c8e-12cef85d350c",
77+
"metadata": {},
78+
"outputs": [
79+
{
80+
"name": "stdout",
81+
"output_type": "stream",
82+
"text": [
83+
"143 ms ± 8.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
84+
]
85+
}
86+
],
87+
"source": [
88+
"%timeit days_polars = df_polars.group_by_dynamic('timestamp', every='1d').agg(pl.exclude('timestamp').mean())"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": 12,
94+
"id": "b6ea2c6d-b63b-4413-b455-445446bc8a54",
95+
"metadata": {},
96+
"outputs": [
97+
{
98+
"name": "stdout",
99+
"output_type": "stream",
100+
"text": [
101+
"308 ms ± 37.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
102+
]
103+
}
104+
],
105+
"source": [
106+
"%timeit df_pandas['avg'] = df_pandas[[f'C{i}' for i in range(1, 101)]].sum(axis=1)"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": 15,
112+
"id": "c7f0452c-bd57-4054-b0e0-d8bcfed164d7",
113+
"metadata": {},
114+
"outputs": [
115+
{
116+
"data": {
117+
"text/html": [
118+
"<div><style>\n",
119+
".dataframe > thead > tr,\n",
120+
".dataframe > tbody > tr {\n",
121+
" text-align: right;\n",
122+
" white-space: pre-wrap;\n",
123+
"}\n",
124+
"</style>\n",
125+
"<small>shape: (788_323, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>avg</th></tr><tr><td>f64</td></tr></thead><tbody><tr><td>100.0</td></tr><tr><td>99.984866</td></tr><tr><td>99.94813</td></tr><tr><td>99.905195</td></tr><tr><td>99.953866</td></tr><tr><td>&hellip;</td></tr><tr><td>79.357997</td></tr><tr><td>79.347877</td></tr><tr><td>79.308389</td></tr><tr><td>79.302168</td></tr><tr><td>79.262233</td></tr></tbody></table></div>"
126+
],
127+
"text/plain": [
128+
"shape: (788_323, 1)\n",
129+
"┌───────────┐\n",
130+
"│ avg │\n",
131+
"│ --- │\n",
132+
"│ f64 │\n",
133+
"╞═══════════╡\n",
134+
"│ 100.0 │\n",
135+
"│ 99.984866 │\n",
136+
"│ 99.94813 │\n",
137+
"│ 99.905195 │\n",
138+
"│ 99.953866 │\n",
139+
"│ … │\n",
140+
"│ 79.357997 │\n",
141+
"│ 79.347877 │\n",
142+
"│ 79.308389 │\n",
143+
"│ 79.302168 │\n",
144+
"│ 79.262233 │\n",
145+
"└───────────┘"
146+
]
147+
},
148+
"execution_count": 15,
149+
"metadata": {},
150+
"output_type": "execute_result"
151+
}
152+
],
153+
"source": [
154+
"df_polars.select(pl.sum_horizontal(pl.exclude('timestamp')).alias('avg'))"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": null,
160+
"id": "679aa3bb-5bf8-47db-b421-8a316883767c",
161+
"metadata": {},
162+
"outputs": [],
163+
"source": []
164+
}
165+
],
166+
"metadata": {
167+
"kernelspec": {
168+
"display_name": "Python 3 (ipykernel)",
169+
"language": "python",
170+
"name": "python3"
171+
},
172+
"language_info": {
173+
"codemirror_mode": {
174+
"name": "ipython",
175+
"version": 3
176+
},
177+
"file_extension": ".py",
178+
"mimetype": "text/x-python",
179+
"name": "python",
180+
"nbconvert_exporter": "python",
181+
"pygments_lexer": "ipython3",
182+
"version": "3.12.5"
183+
}
184+
},
185+
"nbformat": 4,
186+
"nbformat_minor": 5
187+
}

0 commit comments

Comments
 (0)