Commit ee4fac1

Merge branch 'main' into fix/not-json-serializable
2 parents: 28c2165 + ebe0a3c

34 files changed: +3260 −2196 lines

.github/workflows/python-ci.yml
Lines changed: 3 additions & 0 deletions

@@ -75,6 +75,9 @@ jobs:
 
     steps:
       - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
       - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
       - name: Install

.github/workflows/stale.yml
Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ jobs:
     if: github.repository_owner == 'apache'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@v10.0.0
+      - uses: actions/stale@v10.1.0
        with:
          stale-issue-label: 'stale'
          exempt-issue-labels: 'not-stale'

.pre-commit-config.yaml
Lines changed: 6 additions & 1 deletion

@@ -37,7 +37,12 @@ repos:
     hooks:
       - id: mypy
         args:
-          [--install-types, --non-interactive, --config=pyproject.toml]
+          [--config=pyproject.toml]
+        additional_dependencies:
+          - types-cachetools
+          - types-pytest-lazy-fixture
+          - types-pytz
+          - types-requests
   - repo: https://github.com/igorshubovych/markdownlint-cli
     rev: v0.45.0
     hooks:
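Because the mypy hook no longer passes --install-types, the type-stub packages are now pinned as hook dependencies instead. A minimal, hypothetical snippet (not from this repo) of the kind of code mypy can only fully check once types-cachetools is available in the hook environment:

    from cachetools import TTLCache

    # Without the types-cachetools stubs, mypy treats TTLCache as Any and
    # cannot verify the key/value types declared below.
    cache: "TTLCache[str, int]" = TTLCache(maxsize=128, ttl=60)
    cache["answer"] = 42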

Makefile
Lines changed: 4 additions & 6 deletions

@@ -83,8 +83,8 @@ install: install-poetry install-dependencies ## Install Poetry and dependencies
 check-license: ## Check license headers
 	./dev/check-license
 
-lint: ## Run code linters via pre-commit
-	$(POETRY) run pre-commit run --all-files
+lint: ## Run code linters via prek (pre-commit hooks)
+	$(POETRY) run prek run -a
 
 # ===============
 # Testing Section

@@ -100,10 +100,8 @@ test-integration: test-integration-setup test-integration-exec test-integration-
 test-integration-setup: ## Start Docker services for integration tests
 	docker compose -f dev/docker-compose-integration.yml kill
 	docker compose -f dev/docker-compose-integration.yml rm -f
-	docker compose -f dev/docker-compose-integration.yml up -d
-	sleep 10
-	docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py
-	docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	docker compose -f dev/docker-compose-integration.yml up -d --wait
+	$(POETRY) run python dev/provision.py
 
 test-integration-exec: ## Run integration tests (excluding provision)
 	$(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)

dev/Dockerfile

Lines changed: 0 additions & 98 deletions
This file was deleted.

dev/docker-compose-integration.yml
Lines changed: 8 additions & 3 deletions

@@ -17,9 +17,8 @@
 
 services:
   spark-iceberg:
-    image: python-integration
     container_name: pyiceberg-spark
-    build: .
+    build: spark/
     networks:
       iceberg_net:
     depends_on:

@@ -37,6 +36,12 @@ services:
       - rest:rest
       - hive:hive
       - minio:minio
+    healthcheck:
+      test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
   rest:
     image: apache/iceberg-rest-fixture
     container_name: pyiceberg-rest

@@ -87,7 +92,7 @@ services:
       "
   hive:
     build: hive/
-    container_name: hive
+    container_name: pyiceberg-hive
     hostname: hive
     networks:
       iceberg_net:
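The new healthcheck is what makes the Makefile's `docker compose up -d --wait` meaningful: --wait blocks until containers report healthy, and the spark-iceberg container is only healthy once something is listening on the Spark Connect port 15002. A rough Python equivalent of what the check verifies, assuming the stack exposes that port on localhost:

    import socket

    def spark_connect_ready(host: str = "localhost", port: int = 15002, timeout: float = 2.0) -> bool:
        # Roughly mirrors the compose healthcheck: succeed once the
        # Spark Connect port of the spark-iceberg container accepts connections.
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except OSError:
            return False

    if __name__ == "__main__":
        print("spark connect ready:", spark_connect_ready())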

dev/entrypoint.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

dev/provision.py
Lines changed: 21 additions & 35 deletions

@@ -14,7 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import math
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr

@@ -23,35 +22,26 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType
 
-# The configuration is important, otherwise we get many small
-# parquet files with a single row. When a positional delete
-# hits the Parquet file with one row, the parquet file gets
-# dropped instead of having a merge-on-read delete file.
-spark = (
-    SparkSession
-    .builder
-    .config("spark.sql.shuffle.partitions", "1")
-    .config("spark.default.parallelism", "1")
-    .getOrCreate()
-)
+# Create SparkSession against the remote Spark Connect server
+spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
 
 catalogs = {
-    'rest': load_catalog(
+    "rest": load_catalog(
         "rest",
         **{
             "type": "rest",
-            "uri": "http://rest:8181",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "http://localhost:8181",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
     ),
-    'hive': load_catalog(
+    "hive": load_catalog(
         "hive",
         **{
             "type": "hive",
-            "uri": "thrift://hive:9083",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "thrift://localhost:9083",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },

@@ -119,7 +109,7 @@
 # v3: Using deletion vectors
 
 for format_version in [2, 3]:
-    identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}'
+    identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}"
     spark.sql(
         f"""
         CREATE OR REPLACE TABLE {identifier} (

@@ -137,10 +127,8 @@
         """
     )
 
-    spark.sql(
-        f"""
-        INSERT INTO {identifier}
-        VALUES
+    spark.sql("""
+        SELECT * FROM VALUES
         (CAST('2023-03-01' AS date), 1, 'a'),
         (CAST('2023-03-02' AS date), 2, 'b'),
         (CAST('2023-03-03' AS date), 3, 'c'),

@@ -152,9 +140,9 @@
         (CAST('2023-03-09' AS date), 9, 'i'),
         (CAST('2023-03-10' AS date), 10, 'j'),
         (CAST('2023-03-11' AS date), 11, 'k'),
-        (CAST('2023-03-12' AS date), 12, 'l');
-        """
-    )
+        (CAST('2023-03-12' AS date), 12, 'l')
+        AS t(dt, number, letter)
+    """).coalesce(1).writeTo(identifier).append()
 
     spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12")
 

@@ -164,7 +152,7 @@
 
     spark.sql(f"DELETE FROM {identifier} WHERE number = 9")
 
-    identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}'
+    identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}"
 
     spark.sql(
         f"""

@@ -178,15 +166,13 @@
             'write.delete.mode'='merge-on-read',
             'write.update.mode'='merge-on-read',
             'write.merge.mode'='merge-on-read',
-            'format-version'='2'
+            'format-version'='{format_version}'
         );
         """
     )
 
-    spark.sql(
-        f"""
-        INSERT INTO {identifier}
-        VALUES
+    spark.sql("""
+        SELECT * FROM VALUES
        (CAST('2023-03-01' AS date), 1, 'a'),
        (CAST('2023-03-02' AS date), 2, 'b'),
        (CAST('2023-03-03' AS date), 3, 'c'),

@@ -198,9 +184,9 @@
         (CAST('2023-03-09' AS date), 9, 'i'),
         (CAST('2023-03-10' AS date), 10, 'j'),
         (CAST('2023-03-11' AS date), 11, 'k'),
-        (CAST('2023-03-12' AS date), 12, 'l');
-        """
-    )
+        (CAST('2023-03-12' AS date), 12, 'l')
+        AS t(dt, number, letter)
+    """).coalesce(1).writeTo(identifier).append()
 
     # Perform two deletes, should produce:
     # v2: two positional delete files in v2
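With provisioning now running on the host against the Spark Connect endpoint, the same localhost endpoints can be used to read the provisioned tables back through PyIceberg. A minimal sketch, assuming the docker-compose stack from this commit is up and the REST catalog is reachable on localhost:8181:

    from pyiceberg.catalog import load_catalog

    # Same connection settings as dev/provision.py, pointed at the local stack.
    catalog = load_catalog(
        "rest",
        **{
            "type": "rest",
            "uri": "http://localhost:8181",
            "s3.endpoint": "http://localhost:9000",
            "s3.access-key-id": "admin",
            "s3.secret-access-key": "password",
        },
    )

    # One of the tables created by provision.py (format version 2).
    table = catalog.load_table("default.test_positional_mor_deletes_v2")
    # The scan applies the merge-on-read delete files written by the provisioning script.
    print(table.scan().to_arrow())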
