Commit 518b4f1

Fix pyarrow and dask parquet issues (#92)
* Create test data files using pyarrow 5.0.0 and dask 2021.7.2
* Fix pyarrow 5.0.0 warnings
* Remove metadata access warnings for pyarrow 5.0.0
* Changes to work with dask 2021.8.0 and pyarrow 5.0.0
* Changes to work with dask 2022.7.1
* Up to date, with test parquet files for pyarrow 5.0.0 and 8.0.0
* Reduce size of parquet test files and add them to MANIFEST
* Fix linting
* Increase test matrix
* Rename license_file to license_files
* Remove test warnings
* Set environment channels in CI test.yaml
* Improvements to build system when using pip
1 parent 91968d9 commit 518b4f1

31 files changed (+162, -87 lines)

.github/workflows/build.yaml

Lines changed: 5 additions & 5 deletions
```diff
@@ -28,8 +28,8 @@ jobs:
         shell: bash -l {0}
     env:
       CHANS_DEV: "-c pyviz/label/dev -c conda-forge"
-      PKG_TEST_PYTHON: "--test-python=py37"
-      PYTHON_VERSION: "3.7"
+      PKG_TEST_PYTHON: "--test-python=py39"
+      PYTHON_VERSION: "3.9"
       CHANS: "-c pyviz"
       CONDA_UPLOAD_TOKEN: ${{ secrets.CONDA_UPLOAD_TOKEN }}
     steps:
@@ -42,7 +42,7 @@ jobs:
       - uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
-          python-version: 3.8
+          python-version: 3.9
      - name: Set output
        id: vars
        run: echo ::set-output name=tag::${GITHUB_REF#refs/*/}
@@ -67,8 +67,8 @@ jobs:
         shell: bash -l {0}
     env:
       CHANS_DEV: "-c pyviz/label/dev -c conda-forge"
-      PKG_TEST_PYTHON: "--test-python=py37"
-      PYTHON_VERSION: "3.7"
+      PKG_TEST_PYTHON: "--test-python=py39"
+      PYTHON_VERSION: "3.9"
       CHANS: "-c pyviz"
       PPU: ${{ secrets.PPU }}
       PPP: ${{ secrets.PPP }}
```

.github/workflows/test.yaml

Lines changed: 8 additions & 17 deletions
```diff
@@ -17,25 +17,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ['ubuntu-latest']
-        python-version: [3.8]
-        exclude:
-          - os: macos-latest
-            python-version: 3.6
-          - os: macos-latest
-            python-version: 3.8
-          # Fiona for Python 3.6 on Windows has build issues
-          # See https://github.com/conda-forge/fiona-feedstock/issues/171 for more details
-          - os: windows-latest
-            python-version: 3.6
+        os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
+        python-version: [3.7, 3.8, 3.9, '3.10']
     timeout-minutes: 60
     defaults:
       run:
         shell: bash -l {0}
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
-      CHANS_DEV: "-c conda-forge -c pyviz/label/dev"
-      CHANS: "-c pyviz"
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - uses: actions/checkout@v2
@@ -53,16 +42,18 @@ jobs:
       - name: conda setup
         run: |
           conda config --set always_yes True
-          conda install -c pyviz "pyctdev>=0.5"
+          conda install -c pyviz pyctdev
           doit ecosystem_setup
-          doit env_create ${{ env.CHANS_DEV}} --python=${{ matrix.python-version }}
+          conda create -n test-environment python=${{ matrix.python-version }}
+          conda activate test-environment
+          conda config --env --append channels pyviz/label/dev --append channels conda-forge
+          conda install pyctdev
       - name: doit develop_install
         run: |
           eval "$(conda shell.bash hook)"
           conda activate test-environment
-          conda install ${{ env.CHANS_DEV }} "pip<21.2.1"
           conda list
-          doit develop_install ${{ env.CHANS_DEV }} -o tests
+          doit develop_install -o tests
       - name: doit env_capture
         run: |
           eval "$(conda shell.bash hook)"
```

.gitignore

Lines changed: 0 additions & 1 deletion
```diff
@@ -103,7 +103,6 @@ venv.bak/
 # mypy
 .mypy_cache/
 
-*.parq
 .idea/
 spatialpandas/.version
 
```

MANIFEST.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,6 +3,7 @@ include NOTICE
 include README.md
 include CHANGELOG.md
 include spatialpandas/.version
+graft spatialpandas/tests/test_data
 global-exclude *.py[co]
 global-exclude *~
 global-exclude *.ipynb_checkpoints/*
```

setup.cfg

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,5 +1,9 @@
 [metadata]
-license_file = LICENSE
+license_files = LICENSE
 
 [wheel]
 universal = 1
+
+[tool:pyctdev.conda]
+namespace_map =
+    geopandas=geopandas-base
```

setup.py

Lines changed: 6 additions & 8 deletions
```diff
@@ -7,16 +7,15 @@
         'codecov',
         'flake8',
         'hilbertcurve',
-        'geopandas-base',
+        'geopandas',
         'hypothesis',
-        'libstdcxx-ng >=12',
+        'keyring',
         'pytest-cov',
         'pytest',
+        'rfc3986',
         'scipy',
         'shapely',
         'twine',
-        'rfc3986',
-        'keyring',
     ],
     'examples': [
         'datashader',
@@ -28,15 +27,14 @@
 }
 
 install_requires = [
+    'dask[complete]',
     'fsspec',
     'numba',
-    'pandas >=0.25',
+    'pandas',
     'param',
     'pyarrow >=1.0',
     'python-snappy',
     'retrying',
-    'numpy',
-    'dask[complete] >=2.0'
 ]
 
 setup_args = dict(
@@ -52,7 +50,7 @@
     url='https://github.com/holoviz/spatialpandas',
     maintainer='Datashader developers',
     maintainer_email='dev@datashader.org',
-    python_requires='>=3.6',
+    python_requires='>=3.7',
     install_requires=install_requires,
     extras_require=extras_require,
     tests_require=extras_require['tests'],
```

spatialpandas/geometry/multipolygon.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -59,7 +59,7 @@ def to_shapely(self):
             shapely MultiPolygon shape
         """
         import shapely.geometry as sg
-        polygon_arrays = np.asarray(self.data.as_py())
+        polygon_arrays = np.asarray(self.data.as_py(), dtype=object)
 
         polygons = []
         for polygon_array in polygon_arrays:
```

spatialpandas/geometry/polygon.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -61,7 +61,7 @@ def to_shapely(self):
         """
         import shapely.geometry as sg
         ring_arrays = [np.asarray(line_coords).reshape(len(line_coords) // 2, 2)
-                       for line_coords in np.asarray(self.data.as_py())]
+                       for line_coords in np.asarray(self.data.as_py(), dtype=object)]
         rings = [sg.LinearRing(ring_array) for ring_array in ring_arrays]
         return sg.Polygon(shell=rings[0], holes=rings[1:])
 
```
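
Both geometry fixes address the same NumPy behavior: building an array from ragged nested sequences (rings or polygons with differing vertex counts) without an explicit `dtype=object` triggers a `VisibleDeprecationWarning` on NumPy >= 1.20 and an error on later releases. A minimal standalone sketch (coordinate values hypothetical):

```python
import numpy as np

# Two rings with different numbers of coordinates: a ragged nested sequence.
ragged = [[0.0, 0.0, 1.0, 0.0, 1.0, 1.0], [0.0, 0.0, 2.0, 0.0]]

# np.asarray(ragged) warns on NumPy >= 1.20 and raises on newer releases;
# passing dtype=object makes the intent explicit and keeps the old behavior.
arr = np.asarray(ragged, dtype=object)
for coords in arr:
    print(len(coords))  # 6, then 4
```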

spatialpandas/io/parquet.py

Lines changed: 35 additions & 49 deletions
```diff
@@ -1,10 +1,9 @@
-import copy
 import json
 import pathlib
-from distutils.version import LooseVersion
 from functools import reduce
 from glob import has_magic
 from numbers import Number
+from packaging.version import Version
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import fsspec
```
```diff
@@ -31,7 +30,7 @@
 )
 
 # improve pandas compatibility, based on geopandas _compat.py
-PANDAS_GE_12 = str(pd.__version__) >= LooseVersion("1.2.0")
+PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")
 
 _geometry_dtypes = [
     PointDtype, MultiPointDtype, RingDtype, LineDtype,
```
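
For context, `distutils.version.LooseVersion` is deprecated (and `distutils` itself is removed in Python 3.12), while `packaging.version.Version` compares release segments numerically. A quick illustrative sketch:

```python
from packaging.version import Version

# Version compares release segments numerically, so "1.10" sorts after "1.2".
assert Version("1.10.0") > Version("1.2.0")

# A plain lexical string comparison gets the same question wrong.
assert "1.10.0" < "1.2.0"
```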
```diff
@@ -63,16 +62,18 @@ def _load_parquet_pandas_metadata(
         pqds = pq.ParquetDataset(
             path,
             filesystem=filesystem,
-            validate_schema=False,
+            #validate_schema=False,
+            use_legacy_dataset=False,
             **engine_kwargs,
         )
-        common_metadata = pqds.common_metadata
-        if common_metadata is None:
-            # Get metadata for first piece
-            piece = pqds.pieces[0]
-            metadata = piece.get_metadata().metadata
-        else:
-            metadata = pqds.common_metadata.metadata
+        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+        try:
+            common_metadata = pq.read_metadata(filename)
+        except FileNotFoundError:
+            # Common metadata doesn't exist, so get metadata for first piece instead
+            filename = pathlib.Path(pqds.files[0])
+            common_metadata = pq.read_metadata(filename)
+        metadata = common_metadata.metadata
     else:
         with filesystem.open(path) as f:
             pf = pq.ParquetFile(f)
```
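
The new-style `pq.ParquetDataset` (with `use_legacy_dataset=False`) no longer exposes `common_metadata` or `pieces`, so the code now locates the `_common_metadata` file next to the first data file and falls back to per-file footer metadata. A standalone sketch of that fallback, assuming the non-legacy dataset API (helper name hypothetical):

```python
import pathlib

import pyarrow.parquet as pq

def load_common_metadata(dataset: pq.ParquetDataset) -> pq.FileMetaData:
    """Prefer the dataset-level _common_metadata file; fall back to the
    footer metadata of the first data file when it is absent."""
    first_file = pathlib.Path(dataset.files[0])
    try:
        return pq.read_metadata(first_file.parent / "_common_metadata")
    except FileNotFoundError:
        return pq.read_metadata(first_file)
```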
```diff
@@ -198,7 +199,7 @@ def to_parquet_dask(
     **kwargs: Any,
 ) -> None:
     engine_kwargs = engine_kwargs or {}
-    
+
     if not isinstance(ddf, DaskGeoDataFrame):
         raise TypeError(f"Expected DaskGeoDataFrame not {type(ddf)}")
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
@@ -207,48 +208,27 @@ def to_parquet_dask(
             path and filesystem.isdir(path):
         filesystem.rm(path, recursive=True)
 
-    dd_to_parquet(
-        ddf,
-        path,
-        engine="pyarrow",
-        compression=compression,
-        storage_options=storage_options,
-        **kwargs,
-    )
-
-    # Write partition bounding boxes to the _metadata file
+    # Determine partition bounding boxes to save to _metadata file
     partition_bounds = {}
     for series_name in ddf.columns:
         series = ddf[series_name]
         if isinstance(series.dtype, GeometryDtype):
-            if series._partition_bounds is None:
-                # Bounds are not already computed. Compute bounds from the parquet file
-                # that was just written.
-                filesystem.invalidate_cache(path)
-                series = read_parquet_dask(
-                    path,
-                    columns=[series_name],
-                    filesystem=filesystem,
-                    load_divisions=False,
-                    storage_options=storage_options,
-                )[series_name]
             partition_bounds[series_name] = series.partition_bounds.to_dict()
 
     spatial_metadata = {'partition_bounds': partition_bounds}
     b_spatial_metadata = json.dumps(spatial_metadata).encode('utf')
 
-    pqds = pq.ParquetDataset(
+    dd_to_parquet(
+        ddf,
         path,
-        filesystem=filesystem,
-        validate_schema=False,
+        engine="pyarrow",
+        compression=compression,
+        storage_options=storage_options,
+        custom_metadata={b'spatialpandas': b_spatial_metadata},
+        write_metadata_file=True,
         **engine_kwargs,
+        **kwargs,
     )
-    all_metadata = copy.copy(pqds.common_metadata.metadata)
-    all_metadata[b'spatialpandas'] = b_spatial_metadata
-    schema = pqds.common_metadata.schema.to_arrow_schema()
-    new_schema = schema.with_metadata(all_metadata)
-    with filesystem.open(pqds.common_metadata_path, 'wb') as f:
-        pq.write_metadata(new_schema, f)
 
 
 def read_parquet_dask(
```
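
The rewritten `to_parquet_dask` leans on newer `dask.dataframe.to_parquet` features: `custom_metadata` embeds the serialized bounds into the Parquet key/value metadata, and `write_metadata_file=True` produces the `_metadata`/`_common_metadata` summary files, replacing the manual `pq.write_metadata` round trip. A minimal sketch (path and data hypothetical, assuming a dask release that supports both keywords):

```python
import json

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
payload = json.dumps({"partition_bounds": {}}).encode("utf-8")

# Metadata keys and values must be bytes; dask forwards them to pyarrow.
ddf.to_parquet(
    "/tmp/example.parquet",
    engine="pyarrow",
    custom_metadata={b"spatialpandas": payload},
    write_metadata_file=True,
)
```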
```diff
@@ -293,7 +273,7 @@ def read_parquet_dask(
     build_sindex : boolean
         Whether to build partition level spatial indexes to speed up indexing.
     storage_options: Key/value pairs to be passed on to the file-system backend, if any.
-    engine_kwargs: pyarrow.parquet engine-related keyword arguments. 
+    engine_kwargs: pyarrow.parquet engine-related keyword arguments.
     Returns:
         DaskGeoDataFrame
     """
@@ -357,7 +337,8 @@ def _perform_read_parquet_dask(
         pa.parquet.ParquetDataset(
             path,
             filesystem=filesystem,
-            validate_schema=False,
+            #validate_schema=False,
+            use_legacy_dataset=False,
             **engine_kwargs,
         ) for path in paths
     ]
@@ -366,7 +347,7 @@ def _perform_read_parquet_dask(
     pieces = []
     for dataset in datasets:
         # Perform natural sort on pieces so that "part.10" comes after "part.2"
-        dataset_pieces = sorted(dataset.pieces, key=lambda piece: natural_sort_key(piece.path))
+        dataset_pieces = sorted(dataset.fragments, key=lambda piece: natural_sort_key(piece.path))
         pieces.extend(dataset_pieces)
 
     delayed_partitions = [
```
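
With `use_legacy_dataset=False`, `ParquetDataset` exposes `fragments` (one `pyarrow.dataset` fragment per data file) in place of the legacy `pieces` attribute; each fragment still carries a `path`, so the natural sort above is unchanged. A small sketch (dataset path hypothetical):

```python
import pyarrow.parquet as pq

# Each fragment corresponds to one data file in the dataset directory.
dataset = pq.ParquetDataset("/tmp/example.parquet", use_legacy_dataset=False)
for fragment in dataset.fragments:
    print(fragment.path)  # e.g. /tmp/example.parquet/part.0.parquet
```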
```diff
@@ -419,7 +400,7 @@ def _perform_read_parquet_dask(
         cols_no_index = None
 
     meta = dd_read_parquet(
-        paths[0],
+        datasets[0].files[0],
         columns=cols_no_index,
         filesystem=filesystem,
         engine='pyarrow',
@@ -514,10 +495,15 @@ def _perform_read_parquet_dask(
 
 def _load_partition_bounds(pqds):
     partition_bounds = None
-    if (pqds.common_metadata is not None and
-            b'spatialpandas' in pqds.common_metadata.metadata):
+    filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+    try:
+        common_metadata = pq.read_metadata(filename)
+    except FileNotFoundError:
+        common_metadata = None
+
+    if common_metadata is not None and b'spatialpandas' in common_metadata.metadata:
         spatial_metadata = json.loads(
-            pqds.common_metadata.metadata[b'spatialpandas'].decode('utf')
+            common_metadata.metadata[b'spatialpandas'].decode('utf')
         )
         if "partition_bounds" in spatial_metadata:
             partition_bounds = {}
```
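
Reading the embedded bounds back out mirrors the write path: load the `_common_metadata` footer and decode the `b'spatialpandas'` key. A sketch (path hypothetical):

```python
import json

import pyarrow.parquet as pq

common_metadata = pq.read_metadata("/tmp/example.parquet/_common_metadata")
# FileMetaData.metadata is a bytes-to-bytes mapping (or None if absent).
raw = (common_metadata.metadata or {}).get(b"spatialpandas")
if raw is not None:
    spatial_metadata = json.loads(raw.decode("utf-8"))
    print(spatial_metadata.get("partition_bounds"))
```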
