
Commit db730b6

Authored by iameskild, brl0, and jbednar
Add ability to pass storage_options to pandas.to_parquet. Update docs/CHANGELOG (#79)
* build sindex takes args, returns self, cleanup
* Pass storage and engine options, add some typing
* Pass kwargs to _load_parquet_pandas_metadata
* Remove unused import in tests
* Fix argument order for pack_partitions_to_parquet
* Pass storage_options to read_parquet_dask
* Update docstrings for pack_partitions_to_parquet
* Update docstrings for validate_coerce_filesystem
* Pass storage_options to read_parquet_dask
* Update docstrings for read_parquet_dask
* Update CHANGELOG
* Add overwrite option, minor clean up
* Update CHANGELOG
* Implement version dependent pandas to_parquet function
* Update CHANGELOG
* Pass storage_options to read_parquet_dask
* Add pd_to_parquet to __init__
* Update import statement
* Move import statement
* Minor typing change
* Add __all__, minor cleanup
* Update to_parquet function call
* Update CHANGELOG
* Add engine_kwargs to pd.to_parquet call
* Add engine_kwargs to dask.to_parquet
* Add storage_options to read_parquet
* Add engine_kwargs to read_parquet
* Modify engine_kwargs to pack_partitions_to_parquet
* Update CHANGELOG.md
* Update CHANGELOG.md
* Fix missing final newline
* Modify how fs is handled in to_parquet
* Remove extra whitespace
* Fix return statement indent
* Remove extra whitespace
* Merge holoviz/spatialpandas master into fix/dask_parquet
* Update engine_kwargs formatting
* Move PANDAS_GE_12 into io/parquet.py

Co-authored-by: Brian Larsen <B_R_L@hotmail.com>
Co-authored-by: James A. Bednar <jbednar@users.noreply.github.com>
1 parent b50239a commit db730b6

File tree: 4 files changed (+73 −16 lines)


CHANGELOG.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -1,3 +1,13 @@
+## Version 0.4.3
+
+Date: 2021-08-05
+
+This release primarily expands the optional arguments that can be passed to `to_parquet_dask`/`read_parquet_dask`, ensuring that `storage_options` is passed through where needed. It also adds the ability to pass `storage_options` to the `pandas.to_parquet` function (only for pandas >= 1.2) and renames any reference to `fname` to `path`, to align with the pandas convention.
+
+Bug fixes:
+ - Update `validate_coerce_filesystem` to pass `storage_options` through. ([#78](https://github.com/holoviz/spatialpandas/pull/78))
+
+
 ## Version 0.4.2
 
 This release primarily achieves compatibility with recent releases of Pandas. Many thanks to @Hoxbro for contributing the fixes and @philippjfr for ongoing maintenance of the project.
```
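A short round trip exercises the headline change. This is an illustrative sketch, not code from the commit: it assumes pandas >= 1.2, uses fsspec's in-memory filesystem as a stand-in for a real remote store, and the `PointArray` construction shown is one plausible way to build a small `GeoDataFrame`.

```python
from spatialpandas import GeoDataFrame
from spatialpandas.geometry import PointArray
from spatialpandas.io import read_parquet, to_parquet

# A tiny GeoDataFrame of two points.
gdf = GeoDataFrame({
    "geometry": PointArray([[0.0, 0.0], [1.0, 1.0]]),
    "value": [1, 2],
})

# `path` (formerly `fname`) plus the new storage_options keyword; on
# pandas >= 1.2 the options are forwarded to pandas.to_parquet itself.
to_parquet(gdf, "memory://data.parq", storage_options={})
round_tripped = read_parquet("memory://data.parq", storage_options={})
```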

spatialpandas/dask.py

Lines changed: 24 additions & 4 deletions
```diff
@@ -237,12 +237,16 @@ def pack_partitions_to_parquet(
             These directories are deleted as soon as possible during the execution
             of the function.
+        storage_options: Key/value pairs to be passed on to the file-system backend, if any.
+        engine_kwargs: pyarrow.parquet engine-related keyword arguments.
 
     Returns:
         DaskGeoDataFrame backed by newly written parquet dataset
     """
     from .io import read_parquet, read_parquet_dask
     from .io.utils import validate_coerce_filesystem
 
+    engine_kwargs = engine_kwargs or {}
+
     # Get fsspec filesystem object
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
@@ -346,9 +350,10 @@ def write_partition(df_part, part_path):
                 f,
                 compression=compression,
                 index=True,
-                **(engine_kwargs or {}),
+                **engine_kwargs,
             )
 
+
     def process_partition(df, i):
         subpart_paths = {}
         for out_partition, df_part in df.groupby('_partition'):
@@ -394,7 +399,12 @@ def read_parquet_retry(parts_tmp_path, subpart_paths, part_output_path):
             # Handle rare case where the task was resubmitted and the work has
             # already been done. This shouldn't happen with pure=False, but it
             # seems like it does very rarely.
-            return read_parquet(part_output_path, filesystem=filesystem)
+            return read_parquet(
+                part_output_path,
+                filesystem=filesystem,
+                storage_options=storage_options,
+                **engine_kwargs,
+            )
 
         ls_res = sorted(filesystem.ls(parts_tmp_path, **ls_kwargs))
         subpart_paths_stripped = sorted([filesystem._strip_protocol(_) for _ in subpart_paths])
@@ -414,7 +424,12 @@ def read_parquet_retry(parts_tmp_path, subpart_paths, part_output_path):
                     extras=list(extras)
                 )
             )
-        return read_parquet(parts_tmp_path, filesystem=filesystem)
+        return read_parquet(
+            parts_tmp_path,
+            filesystem=filesystem,
+            storage_options=storage_options,
+            **engine_kwargs,
+        )
 
     def concat_parts(parts_tmp_path, subpart_paths, part_output_path):
         filesystem.invalidate_cache()
@@ -512,7 +527,12 @@ def write_commonmetadata_file():
                 pq.write_metadata(new_schema, f)
         write_commonmetadata_file()
 
-        return read_parquet_dask(path, filesystem=filesystem)
+        return read_parquet_dask(
+            path,
+            filesystem=filesystem,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
 
     def _compute_packing_npartitions(self, npartitions):
         if npartitions is None:
```
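Taken together, these hunks let a packed dataset live behind credentialed storage. A hedged sketch of the expanded call, assuming `ddf` is an existing `DaskGeoDataFrame`; the bucket, the credential flag, and the `row_group_size` value are placeholders rather than anything this commit prescribes.

```python
# `ddf` is assumed to be a spatialpandas.dask.DaskGeoDataFrame, e.g. from
# dask.dataframe.from_pandas(gdf, npartitions=2) on a spatialpandas GeoDataFrame.
packed = ddf.pack_partitions_to_parquet(
    "s3://my-bucket/packed.parq",              # placeholder bucket/path
    npartitions=16,
    storage_options={"anon": False},           # forwarded to fsspec (here s3fs)
    engine_kwargs={"row_group_size": 50_000},  # forwarded to the pyarrow writer
)
# The result is returned via read_parquet_dask, which (per the final hunk
# above) now receives the same storage_options and engine_kwargs.
```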

spatialpandas/io/parquet.py

Lines changed: 38 additions & 12 deletions
```diff
@@ -1,6 +1,7 @@
 import copy
 import json
 import pathlib
+from distutils.version import LooseVersion
 from functools import reduce
 from glob import has_magic
 from numbers import Number
```
```diff
@@ -29,6 +30,9 @@
     validate_coerce_filesystem,
 )
 
+# improve pandas compatibility, based on geopandas _compat.py
+PANDAS_GE_12 = str(pd.__version__) >= LooseVersion("1.2.0")
+
 _geometry_dtypes = [
     PointDtype, MultiPointDtype, RingDtype, LineDtype,
     MultiLineDtype, PolygonDtype, MultiPolygonDtype
```
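`LooseVersion` is used here instead of a plain string comparison because version components compare numerically, so "1.10" correctly sorts after "1.2". A quick illustration (not from this commit; note that `distutils` is deprecated on recent Python, where `packaging.version` is the usual replacement):

```python
from distutils.version import LooseVersion

assert "1.10.0" >= LooseVersion("1.2.0")  # numeric comparison: 10 > 2
assert not ("1.10.0" >= "1.2.0")          # plain strings compare lexicographically
```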
```diff
@@ -50,6 +54,7 @@ def _load_parquet_pandas_metadata(
     storage_options=None,
     engine_kwargs=None,
 ):
+    engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
     if not filesystem.exists(path):
         raise ValueError("Path not found: " + path)
@@ -59,7 +64,7 @@ def _load_parquet_pandas_metadata(
             path,
             filesystem=filesystem,
             validate_schema=False,
-            **(engine_kwargs or {}),
+            **engine_kwargs,
         )
         common_metadata = pqds.common_metadata
         if common_metadata is None:
```
```diff
@@ -98,20 +103,35 @@ def _get_geometry_columns(pandas_metadata):
 
 def to_parquet(
     df: GeoDataFrame,
-    fname: PathType,
+    path: PathType,
     compression: Optional[str] = "snappy",
+    filesystem: Optional[fsspec.AbstractFileSystem] = None,
     index: Optional[bool] = None,
+    storage_options: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> None:
+    if filesystem is not None:
+        filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
+
     # Standard pandas to_parquet with pyarrow engine
-    pd_to_parquet(
-        df,
-        fname,
-        engine="pyarrow",
-        compression=compression,
-        index=index,
+    to_parquet_args = {
+        "df": df,
+        "path": path,
+        "engine": "pyarrow",
+        "compression": compression,
+        "filesystem": filesystem,
+        "index": index,
         **kwargs,
-    )
+    }
+
+    if PANDAS_GE_12:
+        to_parquet_args.update({"storage_options": storage_options})
+    else:
+        if filesystem is None:
+            filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
+        to_parquet_args.update({"filesystem": filesystem})
+
+    pd_to_parquet(**to_parquet_args)
 
 
 def read_parquet(
```
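The net effect of the rewrite, as a hedged usage sketch (the bucket and the anonymous-access option are placeholders): on pandas >= 1.2 `storage_options` travels through pandas itself, while on older pandas an fsspec filesystem is coerced from the same options, since those versions have no `storage_options` parameter.

```python
from spatialpandas.io import to_parquet

# `gdf` is assumed to be a spatialpandas GeoDataFrame.
to_parquet(
    gdf,
    "s3://my-bucket/df.parq",        # `path` replaces the old `fname`
    storage_options={"anon": True},  # pandas >= 1.2: passed to pandas.to_parquet;
)                                    # pandas <  1.2: coerced into an fsspec filesystem
```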
```diff
@@ -122,6 +142,7 @@ def read_parquet(
     engine_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> GeoDataFrame:
+    engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
     # Load pandas parquet metadata
@@ -154,7 +175,7 @@ def read_parquet(
         path,
         filesystem=filesystem,
         validate_schema=False,
-        **(engine_kwargs or {}),
+        **engine_kwargs,
         **kwargs,
     ).read(columns=columns).to_pandas()
 
```
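Because `engine_kwargs` is splatted directly into `pyarrow.parquet.ParquetDataset`, any keyword that constructor accepts should pass through. A sketch, with `memory_map` as one plausible pyarrow option and a local path so no storage options are needed:

```python
from spatialpandas.io import read_parquet

gdf = read_parquet(
    "data.parq",                         # assumed local file from an earlier write
    engine_kwargs={"memory_map": True},  # handed to pyarrow.parquet.ParquetDataset
)
```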
```diff
@@ -176,6 +197,8 @@ def to_parquet_dask(
     engine_kwargs: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> None:
+    engine_kwargs = engine_kwargs or {}
+
     if not isinstance(ddf, DaskGeoDataFrame):
         raise TypeError(f"Expected DaskGeoDataFrame not {type(ddf)}")
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
@@ -207,6 +230,7 @@ def to_parquet_dask(
             columns=[series_name],
             filesystem=filesystem,
             load_divisions=False,
+            storage_options=storage_options,
         )[series_name]
         partition_bounds[series_name] = series.partition_bounds.to_dict()
 
@@ -217,7 +241,7 @@ def to_parquet_dask(
         path,
         filesystem=filesystem,
         validate_schema=False,
-        **(engine_kwargs or {}),
+        **engine_kwargs,
     )
     all_metadata = copy.copy(pqds.common_metadata.metadata)
     all_metadata[b'spatialpandas'] = b_spatial_metadata
```
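A corresponding write-side sketch for the Dask path; the abfs container and account credentials are placeholders. Note that `storage_options` now also reaches the internal `read_parquet_dask` call used to compute partition bounds:

```python
from spatialpandas.io import to_parquet_dask

# `ddf` is assumed to be a spatialpandas.dask.DaskGeoDataFrame.
to_parquet_dask(
    ddf,
    "abfs://container/data.parq",
    storage_options={"account_name": "<name>", "account_key": "<key>"},
)
```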
```diff
@@ -268,6 +292,8 @@ def read_parquet_dask(
         data written by dask/fastparquet, not otherwise.
     build_sindex : boolean
         Whether to build partition level spatial indexes to speed up indexing.
+    storage_options: Key/value pairs to be passed on to the file-system backend, if any.
+    engine_kwargs: pyarrow.parquet engine-related keyword arguments.
     Returns:
         DaskGeoDataFrame
     """
@@ -321,12 +347,12 @@ def _perform_read_parquet_dask(
     storage_options=None,
     engine_kwargs=None,
 ):
+    engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(
         paths[0],
         filesystem,
         storage_options,
     )
-    engine_kwargs = engine_kwargs or {}
     datasets = [
         pa.parquet.ParquetDataset(
             path,
```
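And the matching read side (the bucket is a placeholder; `build_sindex` is the pre-existing option documented in the docstring hunk above):

```python
from spatialpandas.io import read_parquet_dask

ddf = read_parquet_dask(
    "s3://my-bucket/data.parq",
    storage_options={"anon": True},  # reaches _perform_read_parquet_dask and
                                     # every pyarrow ParquetDataset it opens
    build_sindex=True,               # also build partition-level spatial indexes
)
```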

spatialpandas/io/utils.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -20,6 +20,7 @@ def validate_coerce_filesystem(
         path: Path as a string
         filesystem: Optional fsspec filesystem object to use to open the file. If not
             provided, filesystem type is inferred from path
+        storage_options: Key/value pairs to be passed on to the file-system backend, if any.
 
     Returns:
         fsspec file system
```
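For reference, a minimal sketch of what the `storage_options` pass-through amounts to when no filesystem object is supplied; `_infer_fs` here is a hypothetical stand-in that mirrors, rather than reproduces, the helper's logic:

```python
import fsspec

def _infer_fs(path, storage_options=None):
    # Infer the protocol from the path and hand the options to fsspec.
    protocol = fsspec.utils.get_protocol(path)  # "s3" for "s3://...", etc.
    return fsspec.filesystem(protocol, **(storage_options or {}))

fs = _infer_fs("memory://tmp/data.parq")  # the memory filesystem needs no options
```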
