Commit bbb1c25
Fix UUID support (#2007)
# Rationale for this change
The UUID support is a gift that keeps on giving. The current support of
PyIceberg is incomplete, and problematic. Mostly because:
- It is an extension-type in Arrow, which means it is not fully
supported: apache/arrow#46469
apache/arrow#46468
- It doesn't have native support in Spark, where it is converted into a
string. This limits the current tests, which are mostly Spark-based.
I think we have to wait for some fixes in Arrow upstream until we can
fully support this. In PyIceberg, we're converting the `fixed[16]` to a
`UUID`, but Spark does seem to error because the logical type annotation
in Parquet is missing:
```
E py4j.protocol.Py4JJavaError: An error occurred while calling o72.collectToPython.
E : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (localhost executor driver): java.lang.UnsupportedOperationException: Unsupported type: UTF8String
E at org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor.getUTF8String(ArrowVectorAccessor.java:81)
E at org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector.getUTF8String(IcebergArrowColumnVector.java:143)
E at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
E at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
E at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
E at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
E at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
E at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
E at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
E at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
E at org.apache.spark.scheduler.Task.run(Task.scala:141)
E at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
E at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
E at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
E at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
E at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
E at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
E at java.base/java.lang.Thread.run(Thread.java:829)
E
E Driver stacktrace:
E at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
E at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
E at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
E at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
E at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
E at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
E at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
E at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
E at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
E at scala.Option.foreach(Option.scala:407)
E at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
E at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
E at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
E at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
E at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
E at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
E at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4149)
E at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
E at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
E at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
E at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
E at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
E at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
E at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
E at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
E at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
E at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4146)
E at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.base/java.lang.reflect.Method.invoke(Method.java:566)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
E at py4j.Gateway.invoke(Gateway.java:282)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
E at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
E at java.base/java.lang.Thread.run(Thread.java:829)
E Caused by: java.lang.UnsupportedOperationException: Unsupported type: UTF8String
E at org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor.getUTF8String(ArrowVectorAccessor.java:81)
E at org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector.getUTF8String(IcebergArrowColumnVector.java:143)
E at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
E at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
E at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
E at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
E at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
E at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
E at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
E at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
E at org.apache.spark.scheduler.Task.run(Task.scala:141)
E at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
E at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
E at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
E at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
E at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
E at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
E ... 1 more
```
# Are these changes tested?
# Are there any user-facing changes?
Closes #1986
Closes #2002
<!-- In the case of user-facing changes, please add the changelog label.
-->
---------
Co-authored-by: DinGo4DEV <stanleylkal@gmail.com>1 parent 2b9f9e2 commit bbb1c25
File tree
8 files changed
+88
-35
lines changed- pyiceberg
- avro
- io
- tests
- integration
- test_writes
8 files changed
+88
-35
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
32 | 32 | | |
33 | 33 | | |
34 | 34 | | |
| 35 | + | |
35 | 36 | | |
36 | 37 | | |
37 | 38 | | |
| |||
121 | 122 | | |
122 | 123 | | |
123 | 124 | | |
124 | | - | |
125 | | - | |
| 125 | + | |
| 126 | + | |
| 127 | + | |
| 128 | + | |
| 129 | + | |
126 | 130 | | |
127 | 131 | | |
128 | 132 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
746 | 746 | | |
747 | 747 | | |
748 | 748 | | |
749 | | - | |
| 749 | + | |
750 | 750 | | |
751 | 751 | | |
752 | 752 | | |
| |||
1307 | 1307 | | |
1308 | 1308 | | |
1309 | 1309 | | |
| 1310 | + | |
| 1311 | + | |
1310 | 1312 | | |
1311 | 1313 | | |
1312 | 1314 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
467 | 467 | | |
468 | 468 | | |
469 | 469 | | |
470 | | - | |
471 | | - | |
| 470 | + | |
| 471 | + | |
| 472 | + | |
| 473 | + | |
| 474 | + | |
| 475 | + | |
| 476 | + | |
| 477 | + | |
| 478 | + | |
| 479 | + | |
| 480 | + | |
472 | 481 | | |
473 | 482 | | |
474 | 483 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
2827 | 2827 | | |
2828 | 2828 | | |
2829 | 2829 | | |
2830 | | - | |
| 2830 | + | |
2831 | 2831 | | |
2832 | 2832 | | |
2833 | 2833 | | |
| |||
2843 | 2843 | | |
2844 | 2844 | | |
2845 | 2845 | | |
2846 | | - | |
| 2846 | + | |
| 2847 | + | |
| 2848 | + | |
| 2849 | + | |
2847 | 2850 | | |
2848 | 2851 | | |
2849 | 2852 | | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
737 | 737 | | |
738 | 738 | | |
739 | 739 | | |
740 | | - | |
| 740 | + | |
741 | 741 | | |
742 | 742 | | |
743 | 743 | | |
| |||
747 | 747 | | |
748 | 748 | | |
749 | 749 | | |
750 | | - | |
| 750 | + | |
751 | 751 | | |
752 | 752 | | |
753 | 753 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
15 | 15 | | |
16 | 16 | | |
17 | 17 | | |
18 | | - | |
19 | 18 | | |
20 | 19 | | |
21 | 20 | | |
| |||
308 | 307 | | |
309 | 308 | | |
310 | 309 | | |
311 | | - | |
312 | | - | |
313 | | - | |
314 | | - | |
315 | | - | |
316 | | - | |
317 | | - | |
318 | | - | |
319 | | - | |
320 | | - | |
321 | | - | |
322 | | - | |
323 | | - | |
324 | | - | |
325 | | - | |
326 | | - | |
327 | | - | |
328 | | - | |
329 | | - | |
330 | 310 | | |
331 | 311 | | |
332 | 312 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
589 | 589 | | |
590 | 590 | | |
591 | 591 | | |
592 | | - | |
| 592 | + | |
593 | 593 | | |
594 | 594 | | |
595 | 595 | | |
596 | 596 | | |
597 | 597 | | |
598 | | - | |
599 | | - | |
600 | | - | |
| 598 | + | |
| 599 | + | |
| 600 | + | |
601 | 601 | | |
602 | 602 | | |
603 | 603 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
19 | 19 | | |
20 | 20 | | |
21 | 21 | | |
| 22 | + | |
22 | 23 | | |
23 | 24 | | |
24 | 25 | | |
| |||
49 | 50 | | |
50 | 51 | | |
51 | 52 | | |
52 | | - | |
| 53 | + | |
53 | 54 | | |
54 | 55 | | |
55 | 56 | | |
| |||
59 | 60 | | |
60 | 61 | | |
61 | 62 | | |
| 63 | + | |
62 | 64 | | |
63 | 65 | | |
64 | 66 | | |
| |||
1286 | 1288 | | |
1287 | 1289 | | |
1288 | 1290 | | |
1289 | | - | |
| 1291 | + | |
1290 | 1292 | | |
1291 | 1293 | | |
1292 | 1294 | | |
| |||
1858 | 1860 | | |
1859 | 1861 | | |
1860 | 1862 | | |
| 1863 | + | |
| 1864 | + | |
| 1865 | + | |
| 1866 | + | |
| 1867 | + | |
| 1868 | + | |
| 1869 | + | |
| 1870 | + | |
| 1871 | + | |
| 1872 | + | |
| 1873 | + | |
| 1874 | + | |
| 1875 | + | |
| 1876 | + | |
| 1877 | + | |
| 1878 | + | |
| 1879 | + | |
| 1880 | + | |
| 1881 | + | |
| 1882 | + | |
| 1883 | + | |
| 1884 | + | |
| 1885 | + | |
| 1886 | + | |
| 1887 | + | |
| 1888 | + | |
| 1889 | + | |
| 1890 | + | |
| 1891 | + | |
| 1892 | + | |
| 1893 | + | |
| 1894 | + | |
| 1895 | + | |
| 1896 | + | |
| 1897 | + | |
| 1898 | + | |
| 1899 | + | |
| 1900 | + | |
| 1901 | + | |
| 1902 | + | |
| 1903 | + | |
| 1904 | + | |
| 1905 | + | |
| 1906 | + | |
| 1907 | + | |
| 1908 | + | |
| 1909 | + | |
| 1910 | + | |
| 1911 | + | |
| 1912 | + | |
| 1913 | + | |
| 1914 | + | |
| 1915 | + | |
1861 | 1916 | | |
1862 | 1917 | | |
1863 | 1918 | | |
| |||
0 commit comments