
Commit bacb2bb (1 parent: 27c2e70)

chore: add num_proc argument to Dataset.to_sql


src/datasets/arrow_dataset.py

Lines changed: 7 additions & 1 deletion
@@ -5282,6 +5282,7 @@ def to_sql(
         name: str,
         con: Union[str, "sqlalchemy.engine.Connection", "sqlalchemy.engine.Engine", "sqlite3.Connection"],
         batch_size: Optional[int] = None,
+        num_proc: Optional[int] = None,
         **sql_writer_kwargs,
     ) -> int:
         """Exports the dataset to a SQL database.
@@ -5294,6 +5295,11 @@ def to_sql(
             batch_size (`int`, *optional*):
                 Size of the batch to load in memory and write at once.
                 Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+            num_proc (`int`, *optional*):
+                Number of processes for multiprocessing. By default, it doesn't
+                use multiprocessing. `batch_size` in this case defaults to
+                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
+                value if you have sufficient compute power.
             **sql_writer_kwargs (additional keyword arguments):
                 Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).
@@ -5324,7 +5330,7 @@ def to_sql(
         # Dynamic import to avoid circular dependency
         from .io.sql import SqlDatasetWriter

-        return SqlDatasetWriter(self, name, con, batch_size=batch_size, **sql_writer_kwargs).write()
+        return SqlDatasetWriter(self, name, con, batch_size=batch_size, num_proc=num_proc, **sql_writer_kwargs).write()

     def _estimate_nbytes(self) -> int:
         dataset_nbytes = self.data.nbytes
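For context, a minimal sketch of how the new argument might be called after this commit. The table name "my_table" and the SQLite URI "sqlite:///my_dataset.db" are placeholders, and the toy data is purely illustrative; whether parallel writes actually help will depend on the target database (SQLite in particular serializes writers).

    from datasets import Dataset

    # Illustrative data; any Dataset works the same way.
    ds = Dataset.from_dict({"id": [1, 2, 3], "text": ["a", "b", "c"]})

    # With this commit, batches can be prepared and written by several
    # processes in parallel via num_proc; batch_size still controls how many
    # rows are loaded in memory and written at once.
    ds.to_sql("my_table", "sqlite:///my_dataset.db", batch_size=1000, num_proc=4)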
