Add pandas versus polars benchmark

gjbex · gjbex · commit 434ba32f901c · 2024-09-10T16:21:05.000+02:00
diff --git a/source-code/polars/README.md b/source-code/polars/README.md
@@ -8,4 +8,6 @@ Polars is an alternative to pandas that is designed to have better performance.
 1. `patient_data.ipynb`: Jupyter notebook that explores functional differences
    between pandas and polars.  It replicates the notebook in the `pandas`
    directory with the same name.
+1. `polars_versus_pandas_benchmarks.ipynb`: Jupyter notebook that compares the
+   performance of polars and pandas for CSV reading, grouping and row sums.
 1. `data`: Directory containing the data used in the notebook.
diff --git a/source-code/polars/polars_versus_pandas_benchmarks.ipynb b/source-code/polars/polars_versus_pandas_benchmarks.ipynb
@@ -0,0 +1,187 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c9f1da30-f441-4aaa-b91c-c1b6cf3b934d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import polars as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "41066023-b80e-46be-bd3d-538edc93f88e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 11.3 s, sys: 1.22 s, total: 12.5 s\n",
+      "Wall time: 12.5 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time df_pandas = pd.read_csv('large_data_0001.csv', parse_dates=['timestamp'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0dae0538-b515-43eb-a6e7-3c912fdec520",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 9.54 s, sys: 1.81 s, total: 11.4 s\n",
+      "Wall time: 1.08 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%time df_polars = pl.read_csv(source='large_data_0001.csv', try_parse_dates=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c2e3e5b5-5433-439a-be0e-892444a08200",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "689 ms ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit days_pandas = df_pandas.groupby(df_pandas.timestamp.dt.floor('D')).mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "85de86a0-3aac-41f0-9c8e-12cef85d350c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "143 ms ± 8.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit days_polars = df_polars.group_by_dynamic(index_column='timestamp', every='1d').agg(pl.all().exclude('timestamp').mean())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "b6ea2c6d-b63b-4413-b455-445446bc8a54",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "308 ms ± 37.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit df_pandas['avg'] = df_pandas[[f'C{i}' for i in range(1, 101)]].sum(axis='columns')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "c7f0452c-bd57-4054-b0e0-d8bcfed164d7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><style>\n",
+       ".dataframe > thead > tr,\n",
+       ".dataframe > tbody > tr {\n",
+       "  text-align: right;\n",
+       "  white-space: pre-wrap;\n",
+       "}\n",
+       "</style>\n",
+       "<small>shape: (788_323, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>avg</th></tr><tr><td>f64</td></tr></thead><tbody><tr><td>100.0</td></tr><tr><td>99.984866</td></tr><tr><td>99.94813</td></tr><tr><td>99.905195</td></tr><tr><td>99.953866</td></tr><tr><td>&hellip;</td></tr><tr><td>79.357997</td></tr><tr><td>79.347877</td></tr><tr><td>79.308389</td></tr><tr><td>79.302168</td></tr><tr><td>79.262233</td></tr></tbody></table></div>"
+      ],
+      "text/plain": [
+       "shape: (788_323, 1)\n",
+       "┌───────────┐\n",
+       "│ avg       │\n",
+       "│ ---       │\n",
+       "│ f64       │\n",
+       "╞═══════════╡\n",
+       "│ 100.0     │\n",
+       "│ 99.984866 │\n",
+       "│ 99.94813  │\n",
+       "│ 99.905195 │\n",
+       "│ 99.953866 │\n",
+       "│ …         │\n",
+       "│ 79.357997 │\n",
+       "│ 79.347877 │\n",
+       "│ 79.308389 │\n",
+       "│ 79.302168 │\n",
+       "│ 79.262233 │\n",
+       "└───────────┘"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_polars.select(pl.sum_horizontal(pl.all().exclude('timestamp')).alias('avg'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "679aa3bb-5bf8-47db-b421-8a316883767c",
+   "metadata": {},
+   "outputs": [],
+   "source": ["%timeit df_polars.select(pl.sum_horizontal(pl.exclude('timestamp')).alias('avg'))"]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}