
Commit 23e6a1d

address comments

2 parents: b4332b7 + a3e55e5

173 files changed: +7409 −1429 lines


.github/workflows/build_and_test.yml

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ jobs:
       ui=true
       docs=true
     else
+      pyspark_install=false
       pandas=false
       yarn=false
       kubernetes=false

.github/workflows/build_python_3.11_macos.yml

Lines changed: 0 additions & 33 deletions
This file was deleted.

.github/workflows/build_python_connect35.yml renamed to .github/workflows/build_python_connect40.yml

Lines changed: 10 additions & 9 deletions
@@ -17,11 +17,11 @@
 # under the License.
 #

-name: Build / Python-only, Connect-only (master-server, branch-3.5-client, Python 3.11)
+name: Build / Python-only, Connect-only (master-server, branch-4.0-client, Python 3.11)

 on:
   schedule:
-    - cron: '0 21 * * *'
+    - cron: '0 20 * * *'
   workflow_dispatch:

 jobs:
@@ -68,10 +68,10 @@ jobs:
         ./build/sbt -Phive Test/package
     - name: Install Python dependencies
       run: |
-        pip install 'numpy==1.25.1' 'pyarrow>=18.0.0' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+        pip install 'numpy' 'pyarrow>=18.0.0' 'pandas==2.2.3' scipy unittest-xml-reporting 'plotly<6.0.0' 'mlflow>=2.8.1' coverage 'matplotlib' openpyxl 'memory-profiler==0.61.0' 'scikit-learn>=1.3.2'

         # Add Python deps for Spark Connect.
-        pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
+        pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.0' 'googleapis-common-protos==1.71.0' 'graphviz==0.20.3'

         # Add torch as a testing dependency for TorchDistributor
         pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
@@ -91,15 +91,16 @@ jobs:
         PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
           --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
           --jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`" \
-          --conf spark.sql.execution.arrow.pyspark.validateSchema.enabled=false
+          --conf spark.sql.execution.arrow.pyspark.validateSchema.enabled=false \
+          --conf spark.sql.execution.pandas.convertToArrowArraySafely=false

-        # Checkout to branch-3.5 to use the tests in branch-3.5.
+        # Checkout to branch-4.0 to use the tests in branch-4.0.
         cd ..
-        git clone --single-branch --branch branch-3.5 $GITHUB_SERVER_URL/$GITHUB_REPOSITORY spark-3.5
-        cd spark-3.5
+        git clone --single-branch --branch branch-4.0 $GITHUB_SERVER_URL/$GITHUB_REPOSITORY spark-4.0
+        cd spark-4.0

         # Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener.
-        # Run branch-3.5 tests
+        # Run branch-4.0 tests
         ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
         # None of tests are dependent on each other in Pandas API on Spark so run them in parallel
         ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
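For context, the job above starts a master-branch Spark Connect server and then runs the branch-4.0 client test modules against it. A minimal, hypothetical sketch of that client/server split follows; the sc://localhost:15002 URL and the toy query are illustrative assumptions, not part of the workflow (15002 is the connect server's default port):

    # Hypothetical branch-4.0 PySpark Connect client session against the
    # master-branch server started by start-connect-server.sh above.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    spark.range(5).selectExpr("id * 2 AS doubled").show()
    spark.stop()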

.github/workflows/maven_test.yml

Lines changed: 6 additions & 1 deletion
@@ -56,7 +56,8 @@ jobs:
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
     runs-on: ${{ inputs.os }}
-    timeout-minutes: 150
+    # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+    # timeout-minutes: 150
     strategy:
       fail-fast: false
       matrix:
@@ -143,6 +144,8 @@ jobs:
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
       # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
     - name: Cache SBT and Maven
+      # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+      if: ${{ runner.os != 'macOS' }}
       uses: actions/cache@v4
       with:
         path: |
@@ -153,6 +156,8 @@ jobs:
         restore-keys: |
           build-
     - name: Cache Maven local repository
+      # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+      if: ${{ runner.os != 'macOS' }}
       uses: actions/cache@v4
       with:
         path: ~/.m2/repository

.github/workflows/python_hosted_runner_test.yml

Lines changed: 6 additions & 1 deletion
@@ -59,7 +59,8 @@ jobs:
   build:
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ${{ inputs.os }}
-    timeout-minutes: 150
+    # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+    # timeout-minutes: 150
     strategy:
       fail-fast: false
       matrix:
@@ -117,6 +118,8 @@ jobs:
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
       # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
     - name: Cache SBT and Maven
+      # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+      if: ${{ runner.os != 'macOS' }}
       uses: actions/cache@v4
       with:
         path: |
@@ -127,6 +130,8 @@ jobs:
         restore-keys: |
           build-
     - name: Cache Coursier local repository
+      # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+      if: ${{ runner.os != 'macOS' }}
       uses: actions/cache@v4
       with:
         path: ~/.cache/coursier

.github/workflows/test_report.yml

Lines changed: 9 additions & 6 deletions
@@ -28,16 +28,19 @@ jobs:
   test_report:
     if: github.event.workflow_run.conclusion != 'skipped'
     runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      checks: write
+      contents: read
     steps:
     - name: Download test results to report
-      uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # pin @v6
+      uses: actions/download-artifact@v5
       with:
-        github_token: ${{ secrets.GITHUB_TOKEN }}
-        workflow: ${{ github.event.workflow_run.workflow_id }}
-        commit: ${{ github.event.workflow_run.head_commit.id }}
-        workflow_conclusion: completed
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        run-id: ${{ github.event.workflow_run.id }}
+        pattern: "test-*"
     - name: Publish test report
-      uses: scacap/action-surefire-report@a2911bd1a4412ec18dde2d93b1758b3e56d2a880 # pin @v1.8.0
+      uses: scacap/action-surefire-report@v1
       with:
         check_name: Report test results
         github_token: ${{ secrets.GITHUB_TOKEN }}

README.md

Lines changed: 2 additions & 2 deletions
@@ -36,10 +36,10 @@ This README file only contains basic setup instructions.
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_maven_java21_arm.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_maven_java21_arm.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_coverage.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_coverage.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_pypy3.10.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_pypy3.10.yml) |
+| | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_pypy3.11.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_pypy3.11.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.10.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.10.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.11_classic_only.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.11_classic_only.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.11_arm.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.11_arm.yml) |
-| | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.11_macos.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.11_macos.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.11_macos26.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.11_macos26.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_numpy_2.1.3.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_numpy_2.1.3.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.12.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.12.yml) |
@@ -48,7 +48,7 @@ This README file only contains basic setup instructions.
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_3.14.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_3.14.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_minimum.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_minimum.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_ps_minimum.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_ps_minimum.yml) |
-| | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_connect35.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_connect35.yml) |
+| | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_connect40.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_connect40.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_python_connect.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_python_connect.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_sparkr_window.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_sparkr_window.yml) |
 | | [![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/publish_snapshot.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/publish_snapshot.yml) |

connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala

Lines changed: 78 additions & 0 deletions
@@ -29,6 +29,7 @@ import org.apache.avro.io.{DecoderFactory, EncoderFactory}
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.avro.{functions => Fns}
+import org.apache.spark.sql.avro.functions.{from_avro, to_avro}
 import org.apache.spark.sql.execution.LocalTableScanExec
 import org.apache.spark.sql.functions.{col, lit, struct}
 import org.apache.spark.sql.internal.SQLConf
@@ -665,4 +666,81 @@ class AvroFunctionsSuite extends QueryTest with SharedSparkSession {
     checkAnswer(df.select(functions.schema_of_avro(avroMultiType)),
       Row("STRUCT<u: STRUCT<member0: INT, member1: STRING> NOT NULL>"))
   }
+
+  test("roundtrip in to_avro and from_avro - TIME type with different precisions") {
+    val df = spark.sql("""
+      SELECT
+        TIME'12:34:56' as time_p0,
+        TIME'12:34:56.1' as time_p1,
+        TIME'12:34:56.12' as time_p2,
+        TIME'12:34:56.123' as time_p3,
+        TIME'12:34:56.1234' as time_p4,
+        TIME'12:34:56.12345' as time_p5,
+        TIME'12:34:56.123456' as time_p6
+    """)
+
+    val precisions = Seq(0, 1, 2, 3, 4, 5, 6)
+    precisions.foreach { p =>
+      val fieldName = s"time_p$p"
+      // Generate correct schema for each precision
+      val avroTimeSchema = s"""
+        |{
+        |  "type": "long",
+        |  "logicalType": "time-micros",
+        |  "spark.sql.catalyst.type": "time($p)"
+        |}
+        """.stripMargin
+
+      val avroDF = df.select(to_avro(col(fieldName)).as("avro"))
+      val readBack = avroDF.select(from_avro($"avro", avroTimeSchema).as(fieldName))
+
+      checkAnswer(readBack, df.select(col(fieldName)))
+    }
+  }
+
+  test("roundtrip in to_avro and from_avro - TIME type in struct") {
+    val df = spark.sql("""
+      SELECT
+        struct(
+          TIME'09:00:00.123' as start,
+          TIME'17:30:45.987654' as end,
+          'Morning Shift' as description
+        ) as schedule
+    """)
+
+    val avroStructDF = df.select(to_avro($"schedule").as("avro"))
+
+    val avroStructSchema = """
+      |{
+      |  "type": "record",
+      |  "name": "schedule",
+      |  "fields": [
+      |    {
+      |      "name": "start",
+      |      "type": {
+      |        "type": "long",
+      |        "logicalType": "time-micros",
+      |        "spark.sql.catalyst.type": "time(3)"
+      |      }
+      |    },
+      |    {
+      |      "name": "end",
+      |      "type": {
+      |        "type": "long",
+      |        "logicalType": "time-micros",
+      |        "spark.sql.catalyst.type": "time(6)"
+      |      }
+      |    },
+      |    {
+      |      "name": "description",
+      |      "type": "string"
+      |    }
+      |  ]
+      |}
+      """.stripMargin
+
+    val readBack = avroStructDF.select(
+      from_avro($"avro", avroStructSchema).as("schedule"))
+    checkAnswer(readBack, df)
+  }
 }
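The new tests exercise Avro's time-micros logical type, which encodes a time of day as microseconds since midnight in a long; the extra "spark.sql.catalyst.type" schema property pins the original TIME(p) precision so it survives the roundtrip. For example, TIME'12:34:56.123456' becomes (12*3600 + 34*60 + 56) * 1_000_000 + 123456 = 45,296,123,456 microseconds. A hedged PySpark sketch of the same roundtrip, assuming a Spark build with TIME support (as these tests target) and spark-avro on the classpath:

    # Sketch only: mirrors the new Scala tests using PySpark's Avro functions.
    from pyspark.sql import SparkSession
    from pyspark.sql.avro.functions import from_avro, to_avro
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    df = spark.sql("SELECT TIME'12:34:56.123456' AS t")

    avro_schema = """
    {
      "type": "long",
      "logicalType": "time-micros",
      "spark.sql.catalyst.type": "time(6)"
    }
    """

    # to_avro serializes the TIME column to Avro binary; from_avro decodes
    # it back using the schema, restoring time(6) precision.
    round_tripped = (
        df.select(to_avro(col("t")).alias("avro"))
          .select(from_avro(col("avro"), avro_schema).alias("t"))
    )
    round_tripped.show(truncate=False)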
