From dfea6a4fba222099888fae70d804822f6d3b9486 Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Fri, 25 Apr 2025 21:50:03 +0530 Subject: [PATCH 01/16] single commit --- xtable-utilities/src/test/resources/my_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-utilities/src/test/resources/my_config.yaml b/xtable-utilities/src/test/resources/my_config.yaml index 1416c04c2..f0594eb9f 100644 --- a/xtable-utilities/src/test/resources/my_config.yaml +++ b/xtable-utilities/src/test/resources/my_config.yaml @@ -19,6 +19,6 @@ targetFormats: - DELTA datasets: - - tableBasePath: /Desktop/opensource/iceberg/warehouse/demo/nyc/taxis - tableDataPath: /Desktop/opensource/iceberg/warehouse/demo/nyc/taxis/data + tableBasePath: /Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis + tableDataPath: /Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis/data tableName: taxis \ No newline at end of file From b75bc7caa7275bfde5c0d3a9bcf9142c72c6a67d Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sat, 17 May 2025 00:08:43 +0530 Subject: [PATCH 02/16] adding delta kernel --- pom.xml | 2 +- xtable-core/pom.xml | 13 +++++ .../org/apache/xtable/DeltaTableKernel.java | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java diff --git a/pom.xml b/pom.xml index bed4d63b4..db995a624 100644 --- a/pom.xml +++ b/pom.xml @@ -53,7 +53,7 @@ xtable-utilities xtable-aws xtable-hive-metastore - xtable-service + diff --git a/xtable-core/pom.xml b/xtable-core/pom.xml index 24bc31df5..42e1f2527 100644 --- a/xtable-core/pom.xml +++ b/xtable-core/pom.xml @@ -110,6 +110,19 @@ test + + io.delta + delta-kernel-api + 3.1.0 + + + + io.delta + delta-kernel-defaults + 3.1.0 + + + org.apache.hadoop diff --git a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java new file mode 100644 index 000000000..266647fbb --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable; + +// import org.junit.jupiter.api.Test; +// +import io.delta.kernel.*; + import io.delta.kernel.defaults.*; +// import org.apache.hadoop.conf.Configuration; + +public class DeltaTableKernel { + // @Test + public void readDeltaKernel() { + // String myTablePath + // ="/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified + // table path. 
Ex: file:/user/tables/myTable + // Configuration hadoopConf = new Configuration(); + // Engine myEngine = DefaultEngine.create(hadoopConf); + // Table myTable = Table.forPath(myEngine, myTablePath); + // Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); + // long version = mySnapshot.getVersion(); + // StructType tableSchema = mySnapshot.getSchema(); + // Scan myScan = mySnapshot.getScanBuilder(myEngine).build(); + + // Common information about scanning for all data files to read. + // Row scanState = myScan.getScanState(myEngine); + + // Information about the list of scan files to read + // CloseableIterator scanFiles = myScan.getScanFiles(myEngine); + } +} From 16134b34874f7688fafff5b6f9b3648fbd0caa71 Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Mon, 19 May 2025 23:01:16 +0530 Subject: [PATCH 03/16] adding the test file --- xtable-core/pom.xml | 4 +- .../org/apache/xtable/DeltaTableKernel.java | 100 +++++++++++++++--- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/xtable-core/pom.xml b/xtable-core/pom.xml index 42e1f2527..1e4b2f337 100644 --- a/xtable-core/pom.xml +++ b/xtable-core/pom.xml @@ -113,13 +113,13 @@ io.delta delta-kernel-api - 3.1.0 + 3.3.1 io.delta delta-kernel-defaults - 3.1.0 + 3.3.1 diff --git a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java index 266647fbb..71a8bde6c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java +++ b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java @@ -20,28 +20,96 @@ // import org.junit.jupiter.api.Test; // +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.junit.jupiter.api.Test; +import java.util.Optional; + import io.delta.kernel.*; - import io.delta.kernel.defaults.*; -// import org.apache.hadoop.conf.Configuration; +import io.delta.kernel.defaults.*; +import org.apache.hadoop.conf.Configuration; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.types.StructType; +import io.delta.kernel.data.Row; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.data.FilteredColumnarBatch; +import io.delta.kernel.internal.data.ScanStateRow; +import io.delta.kernel.utils.FileStatus; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.ColumnVector; +import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; public class DeltaTableKernel { - // @Test + private static final Logger logger = LoggerFactory.getLogger(DeltaTableKernel.class); + @Test public void readDeltaKernel() { - // String myTablePath - // ="/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified - // table path. 
Ex: file:/user/tables/myTable - // Configuration hadoopConf = new Configuration(); - // Engine myEngine = DefaultEngine.create(hadoopConf); - // Table myTable = Table.forPath(myEngine, myTablePath); - // Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); - // long version = mySnapshot.getVersion(); - // StructType tableSchema = mySnapshot.getSchema(); - // Scan myScan = mySnapshot.getScanBuilder(myEngine).build(); + logger.info("hello"); + String myTablePath ="/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified + Configuration hadoopConf = new Configuration(); + Engine myEngine = DefaultEngine.create(hadoopConf); + + Table myTable = Table.forPath(myEngine, myTablePath); + Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); + long version = mySnapshot.getVersion(myEngine); + StructType tableSchema = mySnapshot.getSchema(myEngine); + Scan myScan = mySnapshot.getScanBuilder(myEngine).build(); // Common information about scanning for all data files to read. - // Row scanState = myScan.getScanState(myEngine); + Row scanState = myScan.getScanState(myEngine); // Information about the list of scan files to read - // CloseableIterator scanFiles = myScan.getScanFiles(myEngine); - } + CloseableIterator fileIter = myScan.getScanFiles(myEngine); + int readRecordCount = 0; + try { + StructType physicalReadSchema = + ScanStateRow.getPhysicalDataReadSchema(myEngine, scanState); + while (fileIter.hasNext()) { + FilteredColumnarBatch scanFilesBatch = fileIter.next(); + try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { + while (scanFileRows.hasNext()) { + Row scanFileRow = scanFileRows.next(); + FileStatus fileStatus = + InternalScanFileUtils.getAddFileStatus(scanFileRow); + CloseableIterator physicalDataIter = + myEngine.getParquetHandler().readParquetFiles( + singletonCloseableIterator(fileStatus), + physicalReadSchema, + Optional.empty()); + try ( + CloseableIterator transformedData = + Scan.transformPhysicalData( + myEngine, + scanState, + scanFileRow, + physicalDataIter)) { + while (transformedData.hasNext()) { + FilteredColumnarBatch logicalData = transformedData.next(); + ColumnarBatch dataBatch = logicalData.getData(); +// Optional selectionVector = dataReadResult.getSelectionVector(); + + // access the data for the column at ordinal 0 + ColumnVector column0 = dataBatch.getColumnVector(0); + for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { + // check if the row is selected or not + + // Assuming the column type is String. 
+ // If it is a different type, call the relevant function on the `ColumnVector` + System.out.println(column0.getString(rowIndex)); + + } + + } + } + } + } + } + } finally { + fileIter.close(); + } + + + + } } From 3929e95a76b205df405fe02d7eb3ec1eadfd8039 Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Mon, 26 May 2025 22:50:05 +0530 Subject: [PATCH 04/16] adding workable code for iteration over data --- .../org/apache/xtable/DeltaTableKernel.java | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java index 71a8bde6c..7dedf12cf 100644 --- a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java +++ b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java @@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory; import org.junit.jupiter.api.Test; import java.util.Optional; +import java.io.IOException; import io.delta.kernel.*; import io.delta.kernel.defaults.*; @@ -44,12 +45,10 @@ public class DeltaTableKernel { private static final Logger logger = LoggerFactory.getLogger(DeltaTableKernel.class); @Test - public void readDeltaKernel() { - logger.info("hello"); + public void readDeltaKernel() throws IOException{ String myTablePath ="/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified Configuration hadoopConf = new Configuration(); Engine myEngine = DefaultEngine.create(hadoopConf); - Table myTable = Table.forPath(myEngine, myTablePath); Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); long version = mySnapshot.getVersion(myEngine); @@ -87,26 +86,31 @@ public void readDeltaKernel() { while (transformedData.hasNext()) { FilteredColumnarBatch logicalData = transformedData.next(); ColumnarBatch dataBatch = logicalData.getData(); -// Optional selectionVector = dataReadResult.getSelectionVector(); + // access the data for the column at ordinal 0 ColumnVector column0 = dataBatch.getColumnVector(0); - for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { - // check if the row is selected or not + ColumnVector column1 = dataBatch.getColumnVector(1); + ColumnVector column2 = dataBatch.getColumnVector(2); + ColumnVector column3 = dataBatch.getColumnVector(3); - // Assuming the column type is String. 
- // If it is a different type, call the relevant function on the `ColumnVector` - System.out.println(column0.getString(rowIndex)); + for (int rowIndex = 0; rowIndex < column0.getSize() ; rowIndex++) { + System.out.println(column0.getInt(rowIndex)); } + for (int rowIndex = 0; rowIndex < column1.getSize() ; rowIndex++) { + System.out.println(column1.getString(rowIndex)); + } } } } } } - } finally { - fileIter.close(); + } catch (IOException e) + { + e.printStackTrace(); + System.out.println("IOException occurred: " + e.getMessage()); } From c6379b594054bfbe2f73e4381ec713eb989e2d8f Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Wed, 11 Jun 2025 20:53:07 +0530 Subject: [PATCH 05/16] adding Kernel 4.0 code --- xtable-core/pom.xml | 4 +- .../org/apache/xtable/DeltaTableKernel.java | 146 +++++++++--------- 2 files changed, 71 insertions(+), 79 deletions(-) diff --git a/xtable-core/pom.xml b/xtable-core/pom.xml index 1e4b2f337..e926bb6d7 100644 --- a/xtable-core/pom.xml +++ b/xtable-core/pom.xml @@ -113,13 +113,13 @@ io.delta delta-kernel-api - 3.3.1 + 4.0.0 io.delta delta-kernel-defaults - 3.3.1 + 4.0.0 diff --git a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java index 7dedf12cf..64506d2e0 100644 --- a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java +++ b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java @@ -15,105 +15,97 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + package org.apache.xtable; // import org.junit.jupiter.api.Test; // +import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; + +import java.io.IOException; +import java.util.Optional; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.junit.jupiter.api.Test; -import java.util.Optional; -import java.io.IOException; import io.delta.kernel.*; +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.FilteredColumnarBatch; +import io.delta.kernel.data.Row; import io.delta.kernel.defaults.*; -import org.apache.hadoop.conf.Configuration; -import io.delta.kernel.engine.Engine; import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.data.ScanStateRow; import io.delta.kernel.types.StructType; -import io.delta.kernel.data.Row; import io.delta.kernel.utils.CloseableIterator; -import io.delta.kernel.data.FilteredColumnarBatch; -import io.delta.kernel.internal.data.ScanStateRow; import io.delta.kernel.utils.FileStatus; -import io.delta.kernel.internal.InternalScanFileUtils; -import io.delta.kernel.data.ColumnarBatch; -import io.delta.kernel.data.ColumnVector; -import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; public class DeltaTableKernel { - private static final Logger logger = LoggerFactory.getLogger(DeltaTableKernel.class); - @Test - public void readDeltaKernel() throws IOException{ - String myTablePath ="/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified - Configuration hadoopConf = new Configuration(); - Engine myEngine = DefaultEngine.create(hadoopConf); - Table myTable = Table.forPath(myEngine, myTablePath); - Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); - long version 
= mySnapshot.getVersion(myEngine); - StructType tableSchema = mySnapshot.getSchema(myEngine); - Scan myScan = mySnapshot.getScanBuilder(myEngine).build(); + private static final Logger logger = LoggerFactory.getLogger(DeltaTableKernel.class); + + @Test + public void readDeltaKernel() throws IOException { + String myTablePath = + "/Users/vaibhakumar/Desktop/opensource/iceberg/warehouse/demo/nyc/taxis"; // fully qualified + Configuration hadoopConf = new Configuration(); + Engine myEngine = DefaultEngine.create(hadoopConf); + Table myTable = Table.forPath(myEngine, myTablePath); + Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); + long version = mySnapshot.getVersion(); + StructType tableSchema = mySnapshot.getSchema(); + Scan myScan = mySnapshot.getScanBuilder().build(); // Common information about scanning for all data files to read. - Row scanState = myScan.getScanState(myEngine); + Row scanState = myScan.getScanState(myEngine); // Information about the list of scan files to read - CloseableIterator fileIter = myScan.getScanFiles(myEngine); - int readRecordCount = 0; - try { - StructType physicalReadSchema = - ScanStateRow.getPhysicalDataReadSchema(myEngine, scanState); - while (fileIter.hasNext()) { - FilteredColumnarBatch scanFilesBatch = fileIter.next(); - try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { - while (scanFileRows.hasNext()) { - Row scanFileRow = scanFileRows.next(); - FileStatus fileStatus = - InternalScanFileUtils.getAddFileStatus(scanFileRow); - CloseableIterator physicalDataIter = - myEngine.getParquetHandler().readParquetFiles( - singletonCloseableIterator(fileStatus), - physicalReadSchema, - Optional.empty()); - try ( - CloseableIterator transformedData = - Scan.transformPhysicalData( - myEngine, - scanState, - scanFileRow, - physicalDataIter)) { - while (transformedData.hasNext()) { - FilteredColumnarBatch logicalData = transformedData.next(); - ColumnarBatch dataBatch = logicalData.getData(); - - - // access the data for the column at ordinal 0 - ColumnVector column0 = dataBatch.getColumnVector(0); - ColumnVector column1 = dataBatch.getColumnVector(1); - ColumnVector column2 = dataBatch.getColumnVector(2); - ColumnVector column3 = dataBatch.getColumnVector(3); + CloseableIterator fileIter = myScan.getScanFiles(myEngine); + int readRecordCount = 0; + try { + StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(myEngine, scanState); + while (fileIter.hasNext()) { + FilteredColumnarBatch scanFilesBatch = fileIter.next(); + try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { + while (scanFileRows.hasNext()) { + Row scanFileRow = scanFileRows.next(); + FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); + CloseableIterator physicalDataIter = + myEngine + .getParquetHandler() + .readParquetFiles( + singletonCloseableIterator(fileStatus), + physicalReadSchema, + Optional.empty()); + try (CloseableIterator transformedData = + Scan.transformPhysicalData(myEngine, scanState, scanFileRow, physicalDataIter)) { + while (transformedData.hasNext()) { + FilteredColumnarBatch logicalData = transformedData.next(); + ColumnarBatch dataBatch = logicalData.getData(); - for (int rowIndex = 0; rowIndex < column0.getSize() ; rowIndex++) { - System.out.println(column0.getInt(rowIndex)); + // access the data for the column at ordinal 0 + ColumnVector column0 = dataBatch.getColumnVector(0); + ColumnVector column1 = dataBatch.getColumnVector(1); + ColumnVector column2 = dataBatch.getColumnVector(2); + 
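+                      // Note: iterating the raw ColumnVectors below reads every physical row and
+                      // ignores the selection vector that a FilteredColumnarBatch may carry for
+                      // rows filtered out by deletion vectors. A selection-aware alternative is a
+                      // row-oriented read of the same batch, sketched here (logicalData is the
+                      // FilteredColumnarBatch already in scope):
+                      //
+                      //   try (CloseableIterator<Row> selectedRows = logicalData.getRows()) {
+                      //     while (selectedRows.hasNext()) {
+                      //       Row row = selectedRows.next();
+                      //       System.out.println(row.getInt(0) + "," + row.getString(1));
+                      //     }
+                      //   }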
ColumnVector column3 = dataBatch.getColumnVector(3); - } - for (int rowIndex = 0; rowIndex < column1.getSize() ; rowIndex++) { - System.out.println(column1.getString(rowIndex)); - - } - } - } - } - } + for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { + System.out.println(column0.getInt(rowIndex)); + } + for (int rowIndex = 0; rowIndex < column1.getSize(); rowIndex++) { + System.out.println(column1.getString(rowIndex)); + } } - } catch (IOException e) - { - e.printStackTrace(); - System.out.println("IOException occurred: " + e.getMessage()); + } } - - - + } } + } catch (IOException e) { + e.printStackTrace(); + System.out.println("IOException occurred: " + e.getMessage()); + } + } } From 6deb5f7d8f9e0a2cc5ba17ae65f3c6cd72aa7c1a Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Tue, 24 Jun 2025 23:40:12 +0530 Subject: [PATCH 06/16] adding the working code with xtable that check getcurrenttable --- .../DeltaKernelConversionSourceProvider.java | 42 + .../delta/DeltaKernelSchemaExtractor.java | 119 ++ .../delta/DeltaKernelTableExtractor.java | 104 ++ .../xtable/delta/DeltaSchemaExtractor.java | 18 +- .../xtable/delta/DeltaTableExtractor.java | 2 +- .../xtable/hudi/HudiTableExtractor.java | 2 +- .../iceberg/IcebergConversionSource.java | 2 +- .../kernel/DeltaKernelConversionSource.java | 131 ++ .../org/apache/xtable/DeltaTableKernel.java | 2 +- .../xtable/delta/ITDeltaConversionSource.java | 1162 ++++++++--------- .../delta/ITDeltaKernelConversionSource.java | 164 +++ .../xtable/hudi/ITHudiConversionSource.java | 2 +- .../apache/xtable/testutil/ITTestUtils.java | 3 +- 13 files changed, 1138 insertions(+), 615 deletions(-) create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelConversionSourceProvider.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java create mode 100644 xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelConversionSourceProvider.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelConversionSourceProvider.java new file mode 100644 index 000000000..c81353dac --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelConversionSourceProvider.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.delta; + +import org.apache.hadoop.conf.Configuration; + +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; + +import org.apache.xtable.conversion.ConversionSourceProvider; +import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.kernel.DeltaKernelConversionSource; + +public class DeltaKernelConversionSourceProvider extends ConversionSourceProvider { + @Override + public DeltaKernelConversionSource getConversionSourceInstance(SourceTable sourceTable) { + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + // DeltaTable deltaTable = DeltaT/able.forPath(sourceTable.getBasePath()); + return DeltaKernelConversionSource.builder() + .tableName(sourceTable.getName()) + .basePath(sourceTable.getBasePath()) + .engine(engine) + .build(); + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java new file mode 100644 index 000000000..f0fc18736 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.delta; + +import java.util.*; + +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.StringType; +import io.delta.kernel.types.StructType; + +import org.apache.xtable.collectors.CustomCollectors; +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.schema.SchemaUtils; + +public class DeltaKernelSchemaExtractor { + + private static final String DELTA_COLUMN_MAPPING_ID = "delta.columnMapping.id"; + private static final DeltaKernelSchemaExtractor INSTANCE = new DeltaKernelSchemaExtractor(); + private static final Map + DEFAULT_TIMESTAMP_PRECISION_METADATA = + Collections.singletonMap( + InternalSchema.MetadataKey.TIMESTAMP_PRECISION, InternalSchema.MetadataValue.MICROS); + + public static DeltaKernelSchemaExtractor getInstance() { + return INSTANCE; + } + + public InternalSchema toInternalSchema_v2(StructType structType) { + return toInternalSchema_v2(structType, null, false, null); + } + + String trimmedTypeName = ""; + + private InternalSchema toInternalSchema_v2( + DataType dataType, String parentPath, boolean nullable, String comment) { + + Map metadata = null; + List fields = null; + InternalType type = null; + if (dataType instanceof IntegerType) { + type = InternalType.INT; + trimmedTypeName = "integer"; + } + if (dataType instanceof StringType) { + type = InternalType.STRING; + trimmedTypeName = "string"; + } + if (dataType instanceof StructType) { + // Handle StructType + StructType structType = (StructType) dataType; + // your logic here + + fields = + structType.fields().stream() + .filter( + field -> + !field + .getMetadata() + .contains(DeltaPartitionExtractor.DELTA_GENERATION_EXPRESSION)) + .map( + field -> { + Integer fieldId = + field.getMetadata().contains(DELTA_COLUMN_MAPPING_ID) + ? Long.valueOf(field.getMetadata().getLong(DELTA_COLUMN_MAPPING_ID)) + .intValue() + : null; + String fieldComment = + field.getMetadata().contains("comment") + ? field.getMetadata().getString("comment") + : null; + InternalSchema schema = + toInternalSchema_v2( + field.getDataType(), + SchemaUtils.getFullyQualifiedPath(parentPath, field.getName()), + field.isNullable(), + fieldComment); + return InternalField.builder() + .name(field.getName()) + .fieldId(fieldId) + .parentPath(parentPath) + .schema(schema) + .defaultValue( + field.isNullable() ? InternalField.Constants.NULL_DEFAULT_VALUE : null) + .build(); + }) + .collect(CustomCollectors.toList(structType.fields().size())); + type = InternalType.RECORD; + trimmedTypeName = "struct"; + } + + return InternalSchema.builder() + .name(trimmedTypeName) + .dataType(type) + .comment(comment) + .isNullable(nullable) + .metadata(metadata) + .fields(fields) + .build(); + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java new file mode 100644 index 000000000..f99d31c32 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.delta; + +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; + +import lombok.Builder; + +import io.delta.kernel.*; +import io.delta.kernel.engine.Engine; + +import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.storage.DataLayoutStrategy; +import org.apache.xtable.model.storage.TableFormat; + +/** + * Extracts {@link InternalTable} canonical representation of a table at a point in time for Delta. + */ +@Builder +public class DeltaKernelTableExtractor { + @Builder.Default + private static final DeltaKernelSchemaExtractor schemaExtractor = + DeltaKernelSchemaExtractor.getInstance(); + + private final String basePath; + + public InternalTable table( + Table deltaKernelTable, Snapshot snapshot, Engine engine, String tableName, String basePath) { + try { + // Get schema from Delta Kernel's snapshot + io.delta.kernel.types.StructType schema = snapshot.getSchema(); + + System.out.println("Kernelschema: " + schema); + + InternalSchema internalSchema = schemaExtractor.toInternalSchema_v2(schema); + // io.delta.kernel.types.StructType schema = snapshot.getSchema(); + //// InternalSchema internalSchema = schemaExtractor.toInternalSchema_v2(schema); + // InternalSchema internalSchema = + // schemaExtractor.toInternalSchema(snapshot.getSchema()); + + // Get partition columns + System.out.println("Partition columns: " + internalSchema); + List partitionColumnNames = snapshot.getPartitionColumnNames(); + List partitionFields = new ArrayList<>(); + for (String columnName : partitionColumnNames) { + InternalField sourceField = + InternalField.builder() + .name(columnName) + .schema( + InternalSchema.builder() + .name(columnName) + .dataType(InternalType.STRING) // Assuming string type for partition columns + .build()) + .build(); + + // Create the partition field with the source field + partitionFields.add(InternalPartitionField.builder().sourceField(sourceField).build()); + } + + DataLayoutStrategy dataLayoutStrategy = + partitionFields.isEmpty() + ? 
DataLayoutStrategy.FLAT + : DataLayoutStrategy.HIVE_STYLE_PARTITION; + + // Get the timestamp + long timestamp = snapshot.getTimestamp(engine) * 1000; // Convert to milliseconds + System.out.println("InternalTable basepath" + basePath); + return InternalTable.builder() + .tableFormat(TableFormat.DELTA) + .basePath(basePath) + .name(tableName) + .layoutStrategy(dataLayoutStrategy) + .partitioningFields(partitionFields) + .readSchema(internalSchema) + .latestCommitTime(Instant.ofEpochMilli(timestamp)) + .latestMetadataPath(basePath + "/_delta_log") + .build(); + } catch (Exception e) { + throw new RuntimeException("Failed to extract table information using Delta Kernel", e); + } + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaSchemaExtractor.java index 1376f884e..3b770adf0 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaSchemaExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaSchemaExtractor.java @@ -18,11 +18,7 @@ package org.apache.xtable.delta; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import lombok.AccessLevel; import lombok.NoArgsConstructor; @@ -41,22 +37,10 @@ import org.apache.xtable.model.schema.InternalType; import org.apache.xtable.schema.SchemaUtils; -/** - * Converts between Delta and InternalTable schemas. Some items to be aware of: - * - *
- * <ul>
- *   <li>Delta schemas are represented as Spark StructTypes which do not have enums so the enum
- *       types are lost when converting from XTable to Delta Lake representations
- *   <li>Delta does not have a fixed length byte array option so {@link InternalType#FIXED} is
- *       simply translated to a {@link org.apache.spark.sql.types.BinaryType}
- *   <li>Similarly, {@link InternalType#TIMESTAMP_NTZ} is translated to a long in Delta Lake
- * </ul>
- */ @NoArgsConstructor(access = AccessLevel.PRIVATE) public class DeltaSchemaExtractor { private static final String DELTA_COLUMN_MAPPING_ID = "delta.columnMapping.id"; private static final DeltaSchemaExtractor INSTANCE = new DeltaSchemaExtractor(); - // Timestamps in Delta are microsecond precision by default private static final Map DEFAULT_TIMESTAMP_PRECISION_METADATA = Collections.singletonMap( diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaTableExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaTableExtractor.java index 1929974eb..731b5c300 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaTableExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaTableExtractor.java @@ -62,7 +62,7 @@ public InternalTable table(DeltaLog deltaLog, String tableName, Long version) { .partitioningFields(partitionFields) .readSchema(schema) .latestCommitTime(Instant.ofEpochMilli(snapshot.timestamp())) - .latestMetdataPath(snapshot.deltaLog().logPath().toString()) + .latestMetadataPath(snapshot.deltaLog().logPath().toString()) .build(); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiTableExtractor.java b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiTableExtractor.java index dd5996a77..795f651ce 100644 --- a/xtable-core/src/main/java/org/apache/xtable/hudi/HudiTableExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/hudi/HudiTableExtractor.java @@ -87,7 +87,7 @@ public InternalTable table(HoodieTableMetaClient metaClient, HoodieInstant commi .partitioningFields(partitionFields) .readSchema(canonicalSchema) .latestCommitTime(HudiInstantUtils.parseFromInstantTime(commit.getTimestamp())) - .latestMetdataPath(metaClient.getMetaPath().toString()) + .latestMetadataPath(metaClient.getMetaPath().toString()) .build(); } diff --git a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergConversionSource.java index fe28be0d4..7a777ddb1 100644 --- a/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/iceberg/IcebergConversionSource.java @@ -131,7 +131,7 @@ public InternalTable getTable(Snapshot snapshot) { .latestCommitTime(Instant.ofEpochMilli(snapshot.timestampMillis())) .readSchema(irSchema) .layoutStrategy(dataLayoutStrategy) - .latestMetdataPath(iceOps.current().metadataFileLocation()) + .latestMetadataPath(iceOps.current().metadataFileLocation()) .build(); } diff --git a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java new file mode 100644 index 000000000..f56f333b0 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.kernel; + +import java.io.IOException; +import java.time.Instant; + +import lombok.Builder; + +import org.apache.hadoop.conf.Configuration; + +import io.delta.kernel.Snapshot; +import io.delta.kernel.Table; +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; + +import org.apache.xtable.delta.DeltaKernelTableExtractor; +import org.apache.xtable.exception.ReadException; +import org.apache.xtable.model.*; +import org.apache.xtable.spi.extractor.ConversionSource; + +@Builder +public class DeltaKernelConversionSource implements ConversionSource { + private final String basePath; + private final String tableName; + private final Engine engine; + // private final DeltaKernelTableExtractor tableExtractor; + + @Builder.Default + private final DeltaKernelTableExtractor tableExtractor = + DeltaKernelTableExtractor.builder().build(); + // private final DeltaKernelActionsConverter actionsConverter; + + // public DeltaKernelConversionSource(String basePath, String tableName, Engine engine) { + // this.basePath = basePath; + // this.tableName = tableName; + // this.engine = engine; + // + // } + + @Override + public InternalTable getTable(Long version) { + Configuration hadoopConf = new Configuration(); + try { + Engine engine = DefaultEngine.create(hadoopConf); + Table table = Table.forPath(engine, basePath); + Snapshot snapshot = table.getSnapshotAsOfVersion(engine, version); + System.out.println("getTable: " + basePath); + return tableExtractor.table(table, snapshot, engine, tableName, basePath); + } catch (Exception e) { + throw new ReadException("Failed to get table at version " + version, e); + } + } + + @Override + public InternalTable getCurrentTable() { + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + Table table = Table.forPath(engine, basePath); + System.out.println("getCurrentTable: " + basePath); + Snapshot snapshot = table.getLatestSnapshot(engine); + return getTable(snapshot.getVersion()); + } + + @Override + public InternalSnapshot getCurrentSnapshot() { + return null; + } + + @Override + public TableChange getTableChangeForCommit(Long aLong) { + return null; + } + + @Override + public CommitsBacklog getCommitsBacklog( + InstantsForIncrementalSync instantsForIncrementalSync) { + return null; + } + + @Override + public boolean isIncrementalSyncSafeFrom(Instant instant) { + return false; + } + + @Override + public String getCommitIdentifier(Long aLong) { + return ""; + } + + @Override + public void close() throws IOException {} + + // + // @Override + // public InternalSnapshot getCurrentSnapshot() { + // throw new UnsupportedOperationException("Not implemented yet"); + // } + // + // @Override + // public TableChange getTableChangeForCommit(Long commit) { + // throw new UnsupportedOperationException("Not implemented yet"); + // } + // + // @Override + // public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync + // instantsForIncrementalSync) { + // throw new UnsupportedOperationException("Not implemented yet"); + // } + // + // @Override 
+ // public void close() { + // // No resources to close + // } +} diff --git a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java index 64506d2e0..050d12e64 100644 --- a/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java +++ b/xtable-core/src/test/java/org/apache/xtable/DeltaTableKernel.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + package org.apache.xtable; // import org.junit.jupiter.api.Test; diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java index 0685e9192..ba9a4eadf 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java @@ -21,55 +21,29 @@ import static org.apache.xtable.testutil.ITTestUtils.validateTable; import static org.junit.jupiter.api.Assertions.*; -import java.net.URI; -import java.net.URISyntaxException; import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.Instant; -import java.time.temporal.ChronoUnit; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.apache.spark.serializer.KryoSerializer; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; import org.apache.xtable.GenericTable; -import org.apache.xtable.TestSparkDeltaTable; -import org.apache.xtable.ValidationTestHelper; import org.apache.xtable.conversion.SourceTable; -import org.apache.xtable.model.CommitsBacklog; -import org.apache.xtable.model.InstantsForIncrementalSync; -import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; -import org.apache.xtable.model.TableChange; import org.apache.xtable.model.schema.InternalField; -import org.apache.xtable.model.schema.InternalPartitionField; import org.apache.xtable.model.schema.InternalSchema; import org.apache.xtable.model.schema.InternalType; -import org.apache.xtable.model.schema.PartitionTransformType; import org.apache.xtable.model.stat.ColumnStat; -import org.apache.xtable.model.stat.PartitionValue; import org.apache.xtable.model.stat.Range; import org.apache.xtable.model.storage.*; -import org.apache.xtable.model.storage.InternalDataFile; public class ITDeltaConversionSource { @@ -152,64 +126,64 @@ void setUp() { conversionSourceProvider.init(hadoopConf); } - @Test - void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { - // Table name - final String tableName = GenericTable.getTableName(); - final Path basePath = tempDir.resolve(tableName); - // Create table with a single row using Spark - sparkSession.sql( - "CREATE TABLE `" - + tableName - + "` USING DELTA LOCATION '" - + basePath - + "' 
AS SELECT * FROM VALUES (1, 2)"); - // Create Delta source - SourceTable tableConfig = - SourceTable.builder() - .name(tableName) - .basePath(basePath.toString()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - // Get current snapshot - InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - // Validate table - List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); - validateTable( - snapshot.getTable(), - tableName, - TableFormat.DELTA, - InternalSchema.builder() - .name("struct") - .dataType(InternalType.RECORD) - .fields(fields) - .build(), - DataLayoutStrategy.FLAT, - "file:" + basePath, - snapshot.getTable().getLatestMetdataPath(), - Collections.emptyList()); - // Validate data files - List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); - Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); - validatePartitionDataFiles( - PartitionFileGroup.builder() - .files( - Collections.singletonList( - InternalDataFile.builder() - .physicalPath("file:/fake/path") - .fileFormat(FileFormat.APACHE_PARQUET) - .partitionValues(Collections.emptyList()) - .fileSizeBytes(716) - .recordCount(1) - .columnStats(columnStats) - .build())) - .partitionValues(Collections.emptyList()) - .build(), - snapshot.getPartitionedDataFiles().get(0)); - } - + // @Test + // void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { + // // Table name + // final String tableName = GenericTable.getTableName(); + // final Path basePath = tempDir.resolve(tableName); + // // Create table with a single row using Spark + // sparkSession.sql( + // "CREATE TABLE `" + // + tableName + // + "` USING DELTA LOCATION '" + // + basePath + // + "' AS SELECT * FROM VALUES (1, 2)"); + // // Create Delta source + // SourceTable tableConfig = + // SourceTable.builder() + // .name(tableName) + // .basePath(basePath.toString()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // // Get current snapshot + // InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // // Validate table + // List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); + // validateTable( + // snapshot.getTable(), + // tableName, + // TableFormat.DELTA, + // InternalSchema.builder() + // .name("struct") + // .dataType(InternalType.RECORD) + // .fields(fields) + // .build(), + // DataLayoutStrategy.FLAT, + // "file:" + basePath, + // snapshot.getTable().getLatestMetadataPath(), + // Collections.emptyList()); + // // Validate data files + // List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); + // Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); + // validatePartitionDataFiles( + // PartitionFileGroup.builder() + // .files( + // Collections.singletonList( + // InternalDataFile.builder() + // .physicalPath("file:/fake/path") + // .fileFormat(FileFormat.APACHE_PARQUET) + // .partitionValues(Collections.emptyList()) + // .fileSizeBytes(716) + // .recordCount(1) + // .columnStats(columnStats) + // .build())) + // .partitionValues(Collections.emptyList()) + // .build(), + // snapshot.getPartitionedDataFiles().get(0)); + // } + // @Test void getCurrentTableTest() { // Table name @@ -245,515 +219,519 @@ void getCurrentTableTest() { .build(), DataLayoutStrategy.FLAT, "file:" + basePath, - internalTable.getLatestMetdataPath(), + 
internalTable.getLatestMetadataPath(), Collections.emptyList()); } - @Test - void getCurrentSnapshotPartitionedTest() throws URISyntaxException { - // Table name - final String tableName = GenericTable.getTableName(); - final Path basePath = tempDir.resolve(tableName); - // Create table with a single row using Spark - sparkSession.sql( - "CREATE TABLE `" - + tableName - + "` USING DELTA PARTITIONED BY (part_col)\n" - + "LOCATION '" - + basePath - + "' AS SELECT 'SingleValue' AS part_col, 1 AS col1, 2 AS col2"); - // Create Delta source - SourceTable tableConfig = - SourceTable.builder() - .name(tableName) - .basePath(basePath.toString()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - // Get current snapshot - InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - // Validate table - InternalField partCol = - InternalField.builder() - .name("part_col") - .schema( - InternalSchema.builder() - .name("string") - .dataType(InternalType.STRING) - .isNullable(true) - .build()) - .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) - .build(); - List fields = Arrays.asList(partCol, COL1_INT_FIELD, COL2_INT_FIELD); - validateTable( - snapshot.getTable(), - tableName, - TableFormat.DELTA, - InternalSchema.builder() - .name("struct") - .dataType(InternalType.RECORD) - .fields(fields) - .build(), - DataLayoutStrategy.HIVE_STYLE_PARTITION, - "file:" + basePath, - snapshot.getTable().getLatestMetdataPath(), - Collections.singletonList( - InternalPartitionField.builder() - .sourceField(partCol) - .transformType(PartitionTransformType.VALUE) - .build())); - // Validate data files - List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); - Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); - List partitionValue = - Collections.singletonList( - PartitionValue.builder() - .partitionField( - InternalPartitionField.builder() - .sourceField(partCol) - .transformType(PartitionTransformType.VALUE) - .build()) - .range(Range.scalar("SingleValue")) - .build()); - validatePartitionDataFiles( - PartitionFileGroup.builder() - .partitionValues(partitionValue) - .files( - Collections.singletonList( - InternalDataFile.builder() - .physicalPath("file:/fake/path") - .fileFormat(FileFormat.APACHE_PARQUET) - .partitionValues(partitionValue) - .fileSizeBytes(716) - .recordCount(1) - .columnStats(columnStats) - .build())) - .build(), - snapshot.getPartitionedDataFiles().get(0)); - } - - @Disabled("Requires Spark 3.4.0+") - @Test - void getCurrentSnapshotGenColPartitionedTest() { - // Table name - final String tableName = GenericTable.getTableName(); - final Path basePath = tempDir.resolve(tableName); - // Create table with a single row using Spark - sparkSession.sql( - "CREATE TABLE `" - + tableName - + "` (id BIGINT, event_time TIMESTAMP, day INT GENERATED ALWAYS AS (DATE_FORMAT(event_time, 'YYYY-MM-dd')))" - + " USING DELTA LOCATION '" - + basePath - + "'"); - sparkSession.sql( - "INSERT INTO TABLE `" - + tableName - + "` VALUES(1, CAST('2012-02-12 00:12:34' AS TIMESTAMP))"); - // Create Delta source - SourceTable tableConfig = - SourceTable.builder() - .name(tableName) - .basePath(basePath.toString()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - // Get current snapshot - InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - } - - 
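  // A minimal sketch, mirroring getCurrentTableTest above, of how the Kernel-based
  // source added in this series could be exercised; it assumes a tableName/basePath
  // and Spark-created Delta table set up exactly as in getCurrentTableTest, plus the
  // DeltaKernelConversionSourceProvider from this patch.
  //
  //   SourceTable tableConfig =
  //       SourceTable.builder()
  //           .name(tableName)
  //           .basePath(basePath.toString())
  //           .formatName(TableFormat.DELTA)
  //           .build();
  //   DeltaKernelConversionSource kernelSource =
  //       new DeltaKernelConversionSourceProvider().getConversionSourceInstance(tableConfig);
  //   InternalTable internalTable = kernelSource.getCurrentTable();
  //   assertEquals(tableName, internalTable.getName());
  //   assertEquals(TableFormat.DELTA, internalTable.getTableFormat());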
@ParameterizedTest - @MethodSource("testWithPartitionToggle") - public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); - List> allActiveFiles = new ArrayList<>(); - List allTableChanges = new ArrayList<>(); - List rows = testSparkDeltaTable.insertRows(50); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - List rows1 = testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.upsertRows(rows.subList(0, 20)); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.deleteRows(rows1.subList(0, 20)); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - assertEquals(180L, testSparkDeltaTable.getNumRows()); - InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); - - if (isPartitioned) { - validateDeltaPartitioning(internalSnapshot); - } - ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); - // Get changes in incremental format. - InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - CommitsBacklog commitsBacklog = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - for (Long version : commitsBacklog.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - allTableChanges.add(tableChange); - } - ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); - } - - @Test - public void testsShowingVacuumHasNoEffectOnIncrementalSync() { - boolean isPartitioned = true; - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); - // Insert 50 rows to 2018 partition. - List commit1Rows = testSparkDeltaTable.insertRowsForPartition(50, 2018); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - InternalSnapshot snapshotAfterCommit1 = conversionSource.getCurrentSnapshot(); - List allActivePaths = ValidationTestHelper.getAllFilePaths(snapshotAfterCommit1); - assertEquals(1, allActivePaths.size()); - String activePathAfterCommit1 = allActivePaths.get(0); - - // Upsert all rows inserted before, so all files are replaced. 
- testSparkDeltaTable.upsertRows(commit1Rows.subList(0, 50)); - - // Insert 50 rows to different (2020) partition. - testSparkDeltaTable.insertRowsForPartition(50, 2020); - - // Run vacuum. This deletes all older files from commit1 of 2018 partition. - testSparkDeltaTable.runVacuum(); - - InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); - CommitsBacklog instantCurrentCommitState = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - boolean areFilesRemoved = false; - for (Long version : instantCurrentCommitState.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - areFilesRemoved = areFilesRemoved | checkIfFileIsRemoved(activePathAfterCommit1, tableChange); - } - assertTrue(areFilesRemoved); - assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(timestamp1))); - // Table doesn't have instant of this older commit, hence it is not safe. - Instant instantAsOfHourAgo = Instant.now().minus(1, ChronoUnit.HOURS); - assertFalse(conversionSource.isIncrementalSyncSafeFrom(instantAsOfHourAgo)); - } - - @ParameterizedTest - @MethodSource("testWithPartitionToggle") - public void testVacuum(boolean isPartitioned) { - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); - List> allActiveFiles = new ArrayList<>(); - List allTableChanges = new ArrayList<>(); - List rows = testSparkDeltaTable.insertRows(50); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.deleteRows(rows.subList(0, 20)); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.runVacuum(); - // vacuum has two commits, one for start and one for end, hence adding twice. - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - assertEquals(130L, testSparkDeltaTable.getNumRows()); - InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); - if (isPartitioned) { - validateDeltaPartitioning(internalSnapshot); - } - ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); - // Get changes in incremental format. 
- InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - CommitsBacklog commitsBacklog = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - for (Long version : commitsBacklog.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - allTableChanges.add(tableChange); - } - ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); - } - - @ParameterizedTest - @MethodSource("testWithPartitionToggle") - public void testAddColumns(boolean isPartitioned) { - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, true); - List> allActiveFiles = new ArrayList<>(); - List allTableChanges = new ArrayList<>(); - List rows = testSparkDeltaTable.insertRows(50); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - assertEquals(150L, testSparkDeltaTable.getNumRows()); - InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); - if (isPartitioned) { - validateDeltaPartitioning(internalSnapshot); - } - ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); - // Get changes in incremental format. 
- InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - CommitsBacklog commitsBacklog = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - for (Long version : commitsBacklog.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - allTableChanges.add(tableChange); - } - ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); - } - - @Test - public void testDropPartition() { - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable(tableName, tempDir, sparkSession, "yearOfBirth", false); - List> allActiveFiles = new ArrayList<>(); - List allTableChanges = new ArrayList<>(); - - List rows = testSparkDeltaTable.insertRows(50); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - List rows1 = testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - List allRows = new ArrayList<>(); - allRows.addAll(rows); - allRows.addAll(rows1); - - Map> rowsByPartition = testSparkDeltaTable.getRowsByPartition(allRows); - Integer partitionValueToDelete = rowsByPartition.keySet().stream().findFirst().get(); - testSparkDeltaTable.deletePartition(partitionValueToDelete); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - // Insert few records for deleted partition again to make it interesting. - testSparkDeltaTable.insertRowsForPartition(20, partitionValueToDelete); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - assertEquals( - 120 - rowsByPartition.get(partitionValueToDelete).size(), testSparkDeltaTable.getNumRows()); - InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); - - validateDeltaPartitioning(internalSnapshot); - ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); - // Get changes in incremental format. - InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - CommitsBacklog commitsBacklog = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - for (Long version : commitsBacklog.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - allTableChanges.add(tableChange); - } - ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); - } - - @ParameterizedTest - @MethodSource("testWithPartitionToggle") - public void testOptimizeAndClustering(boolean isPartitioned) { - String tableName = GenericTable.getTableName(); - TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? 
"yearOfBirth" : null, false); - List> allActiveFiles = new ArrayList<>(); - List allTableChanges = new ArrayList<>(); - List rows = testSparkDeltaTable.insertRows(50); - Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.runCompaction(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.runClustering(); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - testSparkDeltaTable.insertRows(50); - allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - - SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); - DeltaConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); - assertEquals(250L, testSparkDeltaTable.getNumRows()); - InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); - if (isPartitioned) { - validateDeltaPartitioning(internalSnapshot); - } - ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); - // Get changes in incremental format. - InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); - CommitsBacklog commitsBacklog = - conversionSource.getCommitsBacklog(instantsForIncrementalSync); - for (Long version : commitsBacklog.getCommitsToProcess()) { - TableChange tableChange = conversionSource.getTableChangeForCommit(version); - allTableChanges.add(tableChange); - } - ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); - } - - private void validateDeltaPartitioning(InternalSnapshot internalSnapshot) { - List partitionFields = - internalSnapshot.getTable().getPartitioningFields(); - assertEquals(1, partitionFields.size()); - InternalPartitionField partitionField = partitionFields.get(0); - assertEquals("birthDate", partitionField.getSourceField().getName()); - assertEquals(PartitionTransformType.YEAR, partitionField.getTransformType()); - } - - private void validatePartitionDataFiles( - PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) - throws URISyntaxException { - assertEquals( - expectedPartitionFiles.getPartitionValues(), actualPartitionFiles.getPartitionValues()); - validateDataFiles(expectedPartitionFiles.getDataFiles(), actualPartitionFiles.getDataFiles()); - } - - private void validateDataFiles( - List expectedFiles, List actualFiles) - throws URISyntaxException { - Assertions.assertEquals(expectedFiles.size(), actualFiles.size()); - for (int i = 0; i < expectedFiles.size(); i++) { - InternalDataFile expected = expectedFiles.get(i); - InternalDataFile actual = actualFiles.get(i); - validatePropertiesDataFile(expected, actual); - } - } - - private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual) - throws URISyntaxException { - Assertions.assertTrue( - Paths.get(new URI(actual.getPhysicalPath()).getPath()).isAbsolute(), - () -> "path == " + actual.getPhysicalPath() + " is not 
absolute"); - Assertions.assertEquals(expected.getFileFormat(), actual.getFileFormat()); - Assertions.assertEquals(expected.getPartitionValues(), actual.getPartitionValues()); - Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes()); - Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); - Instant now = Instant.now(); - long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); - long maxRange = now.toEpochMilli(); - Assertions.assertTrue( - actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, - () -> - "last modified == " - + actual.getLastModified() - + " is expected between " - + minRange - + " and " - + maxRange); - Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); - } - - private static Stream testWithPartitionToggle() { - return Stream.of(Arguments.of(false), Arguments.of(true)); - } - - private boolean checkIfFileIsRemoved(String activePath, TableChange tableChange) { - Set filePathsRemoved = - tableChange.getFilesDiff().getFilesRemoved().stream() - .map(oneDf -> oneDf.getPhysicalPath()) - .collect(Collectors.toSet()); - return filePathsRemoved.contains(activePath); - } + // @Test + // void getCurrentSnapshotPartitionedTest() throws URISyntaxException { + // // Table name + // final String tableName = GenericTable.getTableName(); + // final Path basePath = tempDir.resolve(tableName); + // // Create table with a single row using Spark + // sparkSession.sql( + // "CREATE TABLE `" + // + tableName + // + "` USING DELTA PARTITIONED BY (part_col)\n" + // + "LOCATION '" + // + basePath + // + "' AS SELECT 'SingleValue' AS part_col, 1 AS col1, 2 AS col2"); + // // Create Delta source + // SourceTable tableConfig = + // SourceTable.builder() + // .name(tableName) + // .basePath(basePath.toString()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // // Get current snapshot + // InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // // Validate table + // InternalField partCol = + // InternalField.builder() + // .name("part_col") + // .schema( + // InternalSchema.builder() + // .name("string") + // .dataType(InternalType.STRING) + // .isNullable(true) + // .build()) + // .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) + // .build(); + // List fields = Arrays.asList(partCol, COL1_INT_FIELD, COL2_INT_FIELD); + // validateTable( + // snapshot.getTable(), + // tableName, + // TableFormat.DELTA, + // InternalSchema.builder() + // .name("struct") + // .dataType(InternalType.RECORD) + // .fields(fields) + // .build(), + // DataLayoutStrategy.HIVE_STYLE_PARTITION, + // "file:" + basePath, + // snapshot.getTable().getLatestMetadataPath(), + // Collections.singletonList( + // InternalPartitionField.builder() + // .sourceField(partCol) + // .transformType(PartitionTransformType.VALUE) + // .build())); + // // Validate data files + // List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); + // Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); + // List partitionValue = + // Collections.singletonList( + // PartitionValue.builder() + // .partitionField( + // InternalPartitionField.builder() + // .sourceField(partCol) + // .transformType(PartitionTransformType.VALUE) + // .build()) + // .range(Range.scalar("SingleValue")) + // .build()); + // validatePartitionDataFiles( + // PartitionFileGroup.builder() + // 
.partitionValues(partitionValue) + // .files( + // Collections.singletonList( + // InternalDataFile.builder() + // .physicalPath("file:/fake/path") + // .fileFormat(FileFormat.APACHE_PARQUET) + // .partitionValues(partitionValue) + // .fileSizeBytes(716) + // .recordCount(1) + // .columnStats(columnStats) + // .build())) + // .build(), + // snapshot.getPartitionedDataFiles().get(0)); + // } + // + // @Disabled("Requires Spark 3.4.0+") + // @Test + // void getCurrentSnapshotGenColPartitionedTest() { + // // Table name + // final String tableName = GenericTable.getTableName(); + // final Path basePath = tempDir.resolve(tableName); + // // Create table with a single row using Spark + // sparkSession.sql( + // "CREATE TABLE `" + // + tableName + // + "` (id BIGINT, event_time TIMESTAMP, day INT GENERATED ALWAYS AS + // (DATE_FORMAT(event_time, 'YYYY-MM-dd')))" + // + " USING DELTA LOCATION '" + // + basePath + // + "'"); + // sparkSession.sql( + // "INSERT INTO TABLE `" + // + tableName + // + "` VALUES(1, CAST('2012-02-12 00:12:34' AS TIMESTAMP))"); + // // Create Delta source + // SourceTable tableConfig = + // SourceTable.builder() + // .name(tableName) + // .basePath(basePath.toString()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // // Get current snapshot + // InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // } + // + // @ParameterizedTest + // @MethodSource("testWithPartitionToggle") + // public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable( + // tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); + // List> allActiveFiles = new ArrayList<>(); + // List allTableChanges = new ArrayList<>(); + // List rows = testSparkDeltaTable.insertRows(50); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // List rows1 = testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.upsertRows(rows.subList(0, 20)); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.deleteRows(rows1.subList(0, 20)); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // assertEquals(180L, testSparkDeltaTable.getNumRows()); + // InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + // + // if (isPartitioned) { + // validateDeltaPartitioning(internalSnapshot); + // } + // ValidationTestHelper.validateSnapshot( + // internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + // // Get changes in incremental format. 
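A sketch of the partition expectation the commented-out test above asserts: one hive-style partition column with an identity (VALUE) transform whose single partition value is the literal "SingleValue". `partCol` is the InternalField built earlier in the same test:

    // Expected partition metadata for the single-partition table.
    InternalPartitionField partitionField =
        InternalPartitionField.builder()
            .sourceField(partCol)
            .transformType(PartitionTransformType.VALUE)
            .build();
    List<PartitionValue> partitionValues =
        Collections.singletonList(
            PartitionValue.builder()
                .partitionField(partitionField)
                .range(Range.scalar("SingleValue"))
                .build());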
+ // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + // } + // + // @Test + // public void testsShowingVacuumHasNoEffectOnIncrementalSync() { + // boolean isPartitioned = true; + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable( + // tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); + // // Insert 50 rows to 2018 partition. + // List commit1Rows = testSparkDeltaTable.insertRowsForPartition(50, 2018); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // InternalSnapshot snapshotAfterCommit1 = conversionSource.getCurrentSnapshot(); + // List allActivePaths = ValidationTestHelper.getAllFilePaths(snapshotAfterCommit1); + // assertEquals(1, allActivePaths.size()); + // String activePathAfterCommit1 = allActivePaths.get(0); + // + // // Upsert all rows inserted before, so all files are replaced. + // testSparkDeltaTable.upsertRows(commit1Rows.subList(0, 50)); + // + // // Insert 50 rows to different (2020) partition. + // testSparkDeltaTable.insertRowsForPartition(50, 2020); + // + // // Run vacuum. This deletes all older files from commit1 of 2018 partition. + // testSparkDeltaTable.runVacuum(); + // + // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); + // CommitsBacklog instantCurrentCommitState = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // boolean areFilesRemoved = false; + // for (Long version : instantCurrentCommitState.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // areFilesRemoved = areFilesRemoved | checkIfFileIsRemoved(activePathAfterCommit1, + // tableChange); + // } + // assertTrue(areFilesRemoved); + // assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(timestamp1))); + // // Table doesn't have instant of this older commit, hence it is not safe. + // Instant instantAsOfHourAgo = Instant.now().minus(1, ChronoUnit.HOURS); + // assertFalse(conversionSource.isIncrementalSyncSafeFrom(instantAsOfHourAgo)); + // } + // + // @ParameterizedTest + // @MethodSource("testWithPartitionToggle") + // public void testVacuum(boolean isPartitioned) { + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable( + // tableName, tempDir, sparkSession, isPartitioned ? 
"yearOfBirth" : null, false); + // List> allActiveFiles = new ArrayList<>(); + // List allTableChanges = new ArrayList<>(); + // List rows = testSparkDeltaTable.insertRows(50); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.deleteRows(rows.subList(0, 20)); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.runVacuum(); + // // vacuum has two commits, one for start and one for end, hence adding twice. + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // assertEquals(130L, testSparkDeltaTable.getNumRows()); + // InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + // if (isPartitioned) { + // validateDeltaPartitioning(internalSnapshot); + // } + // ValidationTestHelper.validateSnapshot( + // internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + // // Get changes in incremental format. + // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + // } + // + // @ParameterizedTest + // @MethodSource("testWithPartitionToggle") + // public void testAddColumns(boolean isPartitioned) { + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable( + // tableName, tempDir, sparkSession, isPartitioned ? 
"yearOfBirth" : null, true); + // List> allActiveFiles = new ArrayList<>(); + // List allTableChanges = new ArrayList<>(); + // List rows = testSparkDeltaTable.insertRows(50); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // assertEquals(150L, testSparkDeltaTable.getNumRows()); + // InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + // if (isPartitioned) { + // validateDeltaPartitioning(internalSnapshot); + // } + // ValidationTestHelper.validateSnapshot( + // internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + // // Get changes in incremental format. + // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + // } + // + // @Test + // public void testDropPartition() { + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable(tableName, tempDir, sparkSession, "yearOfBirth", false); + // List> allActiveFiles = new ArrayList<>(); + // List allTableChanges = new ArrayList<>(); + // + // List rows = testSparkDeltaTable.insertRows(50); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // List rows1 = testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // List allRows = new ArrayList<>(); + // allRows.addAll(rows); + // allRows.addAll(rows1); + // + // Map> rowsByPartition = testSparkDeltaTable.getRowsByPartition(allRows); + // Integer partitionValueToDelete = rowsByPartition.keySet().stream().findFirst().get(); + // testSparkDeltaTable.deletePartition(partitionValueToDelete); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // // Insert few records for deleted partition again to make it interesting. 
+ // testSparkDeltaTable.insertRowsForPartition(20, partitionValueToDelete); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // assertEquals( + // 120 - rowsByPartition.get(partitionValueToDelete).size(), + // testSparkDeltaTable.getNumRows()); + // InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + // + // validateDeltaPartitioning(internalSnapshot); + // ValidationTestHelper.validateSnapshot( + // internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + // // Get changes in incremental format. + // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + // } + // + // @ParameterizedTest + // @MethodSource("testWithPartitionToggle") + // public void testOptimizeAndClustering(boolean isPartitioned) { + // String tableName = GenericTable.getTableName(); + // TestSparkDeltaTable testSparkDeltaTable = + // new TestSparkDeltaTable( + // tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); + // List> allActiveFiles = new ArrayList<>(); + // List allTableChanges = new ArrayList<>(); + // List rows = testSparkDeltaTable.insertRows(50); + // Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.runCompaction(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.runClustering(); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // testSparkDeltaTable.insertRows(50); + // allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + // + // SourceTable tableConfig = + // SourceTable.builder() + // .name(testSparkDeltaTable.getTableName()) + // .basePath(testSparkDeltaTable.getBasePath()) + // .formatName(TableFormat.DELTA) + // .build(); + // DeltaConversionSource conversionSource = + // conversionSourceProvider.getConversionSourceInstance(tableConfig); + // assertEquals(250L, testSparkDeltaTable.getNumRows()); + // InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + // if (isPartitioned) { + // validateDeltaPartitioning(internalSnapshot); + // } + // ValidationTestHelper.validateSnapshot( + // internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + // // Get changes in incremental format. 
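On the row-count assertion in the optimize/clustering test below: OPTIMIZE and clustering commits rewrite files without adding or removing rows, so only the inserts contribute to the count. A quick check of the arithmetic:

    // Five insertRows(50) calls; compaction and clustering add no rows.
    long expectedRows = 5 * 50L; // = 250
    assertEquals(expectedRows, testSparkDeltaTable.getNumRows());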
+ // InstantsForIncrementalSync instantsForIncrementalSync = + // InstantsForIncrementalSync.builder() + // .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + // .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + // } + // + // private void validateDeltaPartitioning(InternalSnapshot internalSnapshot) { + // List partitionFields = + // internalSnapshot.getTable().getPartitioningFields(); + // assertEquals(1, partitionFields.size()); + // InternalPartitionField partitionField = partitionFields.get(0); + // assertEquals("birthDate", partitionField.getSourceField().getName()); + // assertEquals(PartitionTransformType.YEAR, partitionField.getTransformType()); + // } + // + // private void validatePartitionDataFiles( + // PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) + // throws URISyntaxException { + // assertEquals( + // expectedPartitionFiles.getPartitionValues(), actualPartitionFiles.getPartitionValues()); + // validateDataFiles(expectedPartitionFiles.getDataFiles(), + // actualPartitionFiles.getDataFiles()); + // } + // + // private void validateDataFiles( + // List expectedFiles, List actualFiles) + // throws URISyntaxException { + // Assertions.assertEquals(expectedFiles.size(), actualFiles.size()); + // for (int i = 0; i < expectedFiles.size(); i++) { + // InternalDataFile expected = expectedFiles.get(i); + // InternalDataFile actual = actualFiles.get(i); + // validatePropertiesDataFile(expected, actual); + // } + // } + // + // private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual) + // throws URISyntaxException { + // Assertions.assertTrue( + // Paths.get(new URI(actual.getPhysicalPath()).getPath()).isAbsolute(), + // () -> "path == " + actual.getPhysicalPath() + " is not absolute"); + // Assertions.assertEquals(expected.getFileFormat(), actual.getFileFormat()); + // Assertions.assertEquals(expected.getPartitionValues(), actual.getPartitionValues()); + // Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes()); + // Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); + // Instant now = Instant.now(); + // long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); + // long maxRange = now.toEpochMilli(); + // Assertions.assertTrue( + // actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, + // () -> + // "last modified == " + // + actual.getLastModified() + // + " is expected between " + // + minRange + // + " and " + // + maxRange); + // Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); + // } + // + // private static Stream testWithPartitionToggle() { + // return Stream.of(Arguments.of(false), Arguments.of(true)); + // } + // + // private boolean checkIfFileIsRemoved(String activePath, TableChange tableChange) { + // Set filePathsRemoved = + // tableChange.getFilesDiff().getFilesRemoved().stream() + // .map(oneDf -> oneDf.getPhysicalPath()) + // .collect(Collectors.toSet()); + // return filePathsRemoved.contains(activePath); + // } } diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java 
b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java new file mode 100644 index 000000000..0c67e894a --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.delta; + +import static org.apache.xtable.testutil.ITTestUtils.validateTable; +import static org.junit.jupiter.api.Assertions.*; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.spark.serializer.KryoSerializer; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.xtable.GenericTable; +import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.kernel.DeltaKernelConversionSource; +import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.storage.*; +import org.apache.xtable.model.storage.DataLayoutStrategy; +import org.apache.xtable.model.storage.TableFormat; + +public class ITDeltaKernelConversionSource { + private static final InternalField COL1_INT_FIELD = + InternalField.builder() + .name("col1") + .schema( + InternalSchema.builder() + .name("integer") + .dataType(InternalType.INT) + .isNullable(true) + .build()) + .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) + .build(); + + private static final InternalField COL2_INT_FIELD = + InternalField.builder() + .name("col2") + .schema( + InternalSchema.builder() + .name("integer") + .dataType(InternalType.INT) + .isNullable(true) + .build()) + .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) + .build(); + + private static final InternalField COL3_STR_FIELD = + InternalField.builder() + .name("col3") + .schema( + InternalSchema.builder() + .name("string") + .dataType(InternalType.STRING) + .isNullable(true) + .build()) + .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) + .build(); + + private DeltaKernelConversionSourceProvider conversionSourceProvider; + private static SparkSession sparkSession; + + @BeforeAll + public static void setupOnce() { + sparkSession = + SparkSession.builder() + .appName("TestDeltaTable") + .master("local[4]") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog") + 
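+            // The two settings above wire Delta into this Spark session: the
+            // session extension installs Delta's SQL parser/planner rules, and
+            // registering DeltaCatalog as spark_catalog lets the
+            // `CREATE TABLE ... USING DELTA` statements in these tests resolve.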
.config("spark.databricks.delta.retentionDurationCheck.enabled", "false") + .config("spark.databricks.delta.schema.autoMerge.enabled", "true") + .config("spark.sql.shuffle.partitions", "1") + .config("spark.default.parallelism", "1") + .config("spark.serializer", KryoSerializer.class.getName()) + .getOrCreate(); + } + + @TempDir private static Path tempDir; + + @BeforeEach + void setUp() { + Configuration hadoopConf = new Configuration(); + hadoopConf.set("fs.defaultFS", "file:///"); + + conversionSourceProvider = new DeltaKernelConversionSourceProvider(); + conversionSourceProvider.init(hadoopConf); + } + + @Test + void getCurrentTableTest() { + // Table name + final String tableName = GenericTable.getTableName(); + final Path basePath = tempDir.resolve(tableName); + // Create table with a single row using Spark + sparkSession.sql( + "CREATE TABLE `" + + tableName + + "` USING DELTA LOCATION '" + + basePath + + "' AS SELECT * FROM VALUES (1, 2, '3')"); + // Create Delta source + SourceTable tableConfig = + SourceTable.builder() + .name(tableName) + .basePath(basePath.toString()) + .formatName(TableFormat.DELTA) + .build(); + System.out.println( + "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); + DeltaKernelConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + // Get current table + InternalTable internalTable = conversionSource.getCurrentTable(); + List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD, COL3_STR_FIELD); + System.out.println("Internal Table: " + internalTable); + System.out.println("Fields: " + fields); + System.out.println("Table Format: " + TableFormat.DELTA); + System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); + System.out.println("Base Path: " + basePath); + System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); + // System.out.println("Latest getLatestMetadataPath : " + InternalSchema); + validateTable( + internalTable, + tableName, + TableFormat.DELTA, + InternalSchema.builder() + .name("struct") + .dataType(InternalType.RECORD) + .fields(fields) + .build(), + DataLayoutStrategy.FLAT, + "file://" + basePath, + internalTable.getLatestMetadataPath(), + Collections.emptyList()); + } +} diff --git a/xtable-core/src/test/java/org/apache/xtable/hudi/ITHudiConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/hudi/ITHudiConversionSource.java index 6b6349cc3..5dd00174c 100644 --- a/xtable-core/src/test/java/org/apache/xtable/hudi/ITHudiConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/hudi/ITHudiConversionSource.java @@ -219,7 +219,7 @@ void getCurrentTableTest() { internalSchema, DataLayoutStrategy.FLAT, "file:" + basePath + "_v1", - internalTable.getLatestMetdataPath(), + internalTable.getLatestMetadataPath(), Collections.emptyList()); } } diff --git a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java index 4b1dac84d..e760d1721 100644 --- a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java +++ b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java @@ -48,12 +48,13 @@ public static void validateTable( String basePath, String latestMetadataPath, List partitioningFields) { + System.out.println("readSchema " + readSchema); Assertions.assertEquals(tableName, internalTable.getName()); Assertions.assertEquals(tableFormat, internalTable.getTableFormat()); 
Assertions.assertEquals(readSchema, internalTable.getReadSchema()); Assertions.assertEquals(dataLayoutStrategy, internalTable.getLayoutStrategy()); Assertions.assertEquals(basePath, internalTable.getBasePath()); - Assertions.assertEquals(latestMetadataPath, internalTable.getLatestMetdataPath()); + Assertions.assertEquals(latestMetadataPath, internalTable.getLatestMetadataPath()); Assertions.assertEquals(partitioningFields, internalTable.getPartitioningFields()); } From c7ba4b975cb0bcfb74c5dcdff80d498f4bd481ee Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Mon, 30 Jun 2025 21:31:06 +0530 Subject: [PATCH 07/16] adding the dependecies --- .../xtable/delta/ITDeltaConversionSource.java | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java index 2ba7832b2..3a754e278 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaConversionSource.java @@ -21,29 +21,55 @@ import static org.apache.xtable.testutil.ITTestUtils.validateTable; import static org.junit.jupiter.api.Assertions.*; +import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.apache.spark.serializer.KryoSerializer; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.apache.xtable.GenericTable; +import org.apache.xtable.TestSparkDeltaTable; +import org.apache.xtable.ValidationTestHelper; import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.model.CommitsBacklog; +import org.apache.xtable.model.InstantsForIncrementalSync; +import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.TableChange; import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalPartitionField; import org.apache.xtable.model.schema.InternalSchema; import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.schema.PartitionTransformType; import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.PartitionValue; import org.apache.xtable.model.stat.Range; import org.apache.xtable.model.storage.*; +import org.apache.xtable.model.storage.InternalDataFile; public class ITDeltaConversionSource { @@ -125,6 +151,7 @@ void setUp() { conversionSourceProvider = new DeltaConversionSourceProvider(); conversionSourceProvider.init(hadoopConf); } + @Test void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { // Table name From 
0ff36a564d47ac8df473fa5540b0d5132620493e Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sat, 19 Jul 2025 22:15:48 +0530 Subject: [PATCH 08/16] adding getcurrentsnapshot code --- .../delta/DeltaKernelActionsConverter.java | 159 ++++++ .../delta/DeltaKernelDataFileExtractor.java | 154 +++++ .../delta/DeltaKernelPartitionExtractor.java | 540 ++++++++++++++++++ .../delta/DeltaKernelStatsExtractor.java | 310 ++++++++++ .../kernel/DeltaKernelConversionSource.java | 45 +- .../delta/ITDeltaKernelConversionSource.java | 237 +++++++- .../apache/xtable/testutil/ITTestUtils.java | 2 +- 7 files changed, 1421 insertions(+), 26 deletions(-) create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelPartitionExtractor.java create mode 100644 xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java new file mode 100644 index 000000000..9cdd5305d --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.delta; + +import static org.apache.xtable.delta.DeltaActionsConverter.getFullPathToFile; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import io.delta.kernel.statistics.DataFileStatistics; +import lombok.AccessLevel; +import lombok.NoArgsConstructor; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import scala.collection.JavaConverters; + +import io.delta.kernel.Snapshot; +import io.delta.kernel.Table; +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.types.*; +import io.delta.kernel.utils.DataFileStatus; +import io.delta.kernel.utils.FileStatus; + +import org.apache.xtable.exception.NotSupportedException; +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.FileStats; +import org.apache.xtable.model.storage.FileFormat; +import org.apache.xtable.model.storage.InternalDataFile; + +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DeltaKernelActionsConverter { + private static final DeltaKernelActionsConverter INSTANCE = new DeltaKernelActionsConverter(); + + public static DeltaKernelActionsConverter getInstance() { + return INSTANCE; + } + + public InternalDataFile convertAddActionToInternalDataFile( + FileStatus addFile, + Snapshot deltaSnapshot, + FileFormat fileFormat, + List partitionFields, + List fields, + boolean includeColumnStats, + DeltaKernelPartitionExtractor partitionExtractor, + DeltaKernelStatsExtractor fileStatsExtractor, + Map partitionValues) { + DataFileStatus dataFileStatus = new DataFileStatus( + addFile.getPath(), + addFile.getModificationTime(), + addFile.getSize(), + Optional.empty() // or Optional.empty() if not available + ); + System.out.println("dataFileStatus:" + dataFileStatus); + FileStats fileStats = + fileStatsExtractor.getColumnStatsForFile(dataFileStatus, fields); + System.out.println("fileStats:" + fileStats); + List columnStats = + includeColumnStats ? 
fileStats.getColumnStats() : Collections.emptyList(); + long recordCount = fileStats.getNumRecords(); + Configuration hadoopConf = new Configuration(); + Engine myEngine = DefaultEngine.create(hadoopConf); + Table myTable = Table.forPath(myEngine, addFile.getPath()); + // The immutable map from Java to Scala is not working, need to + scala.collection.mutable.Map scalaMap = + JavaConverters.mapAsScalaMap(partitionValues); + + return InternalDataFile.builder() + .physicalPath(getFullPathToFile(deltaSnapshot, addFile.getPath(), myTable)) + .fileFormat(fileFormat) + .fileSizeBytes(addFile.getSize()) + .lastModified(addFile.getModificationTime()) + .partitionValues(partitionExtractor.partitionValueExtraction(scalaMap, partitionFields)) + .columnStats(columnStats) + .recordCount(recordCount) + .build(); + } + + // + // public InternalDataFile convertRemoveActionToInternalDataFile( + // RemoveFile removeFile, + // Snapshot deltaSnapshot, + // FileFormat fileFormat, + // List partitionFields, + // DeltaPartitionExtractor partitionExtractor) { + // return InternalDataFile.builder() + // .physicalPath(getFullPathToFile(deltaSnapshot, removeFile.path())) + // .fileFormat(fileFormat) + // .partitionValues( + // partitionExtractor.partitionValueExtraction( + // removeFile.partitionValues(), partitionFields)) + // .build(); + // } + + public FileFormat convertToFileFormat(String provider) { + if (provider.equals("parquet")) { + return FileFormat.APACHE_PARQUET; + } else if (provider.equals("orc")) { + return FileFormat.APACHE_ORC; + } + throw new NotSupportedException( + String.format("delta file format %s is not recognized", provider)); + } + + static String getFullPathToFile(Snapshot snapshot, String dataFilePath, Table myTable) { + Configuration hadoopConf = new Configuration(); + Engine myEngine = DefaultEngine.create(hadoopConf); + + String tableBasePath = myTable.getPath(myEngine); + // String tableBasePath = snapshot.dataPath().toUri().toString(); + if (dataFilePath.startsWith(tableBasePath)) { + return dataFilePath; + } + return tableBasePath + Path.SEPARATOR + dataFilePath; + } + + /** + * Extracts the representation of the deletion vector information corresponding to an AddFile + * action. Currently, this method extracts and returns the path to the data file for which a + * deletion vector data is present. + * + * @param snapshot the commit snapshot + * @param addFile the add file action + * @return the deletion vector representation (path of data file), or null if no deletion vector + * is present + */ + // public String extractDeletionVectorFile(Snapshot snapshot, AddFile addFile) { + // DeletionVectorDescriptor deletionVector = addFile.deletionVector(); + // if (deletionVector == null) { + // return null; + // } + // + // String dataFilePath = addFile.path(); + // return getFullPathToFile(snapshot, dataFilePath); + // } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java new file mode 100644 index 000000000..adafea57d --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.delta; + +// import scala.collection.Map; +import java.util.*; +import java.util.stream.Collectors; + +import io.delta.kernel.internal.actions.AddFile; +import lombok.Builder; + +import org.apache.hadoop.conf.Configuration; + +import io.delta.kernel.Scan; +import io.delta.kernel.Snapshot; +import io.delta.kernel.Table; +import io.delta.kernel.data.FilteredColumnarBatch; +import io.delta.kernel.data.Row; +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.SnapshotImpl; +import io.delta.kernel.types.StructField; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; + +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.storage.FileFormat; +import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.xtable.spi.extractor.DataFileIterator; + +/** DeltaDataFileExtractor lets the consumer iterate over partitions. */ +@Builder +public class DeltaKernelDataFileExtractor { + + @Builder.Default + private final DeltaKernelPartitionExtractor partitionExtractor = + DeltaKernelPartitionExtractor.getInstance(); + + @Builder.Default + private final DeltaKernelStatsExtractor fileStatsExtractor = + DeltaKernelStatsExtractor.getInstance(); + + @Builder.Default + private final DeltaKernelActionsConverter actionsConverter = + DeltaKernelActionsConverter.getInstance(); + + private final String basePath; + + /** + * Initializes an iterator for Delta Lake files. 
+ * + * @return Delta table file iterator + */ + public DataFileIterator iterator(Snapshot deltaSnapshot, InternalSchema schema) { + return new DeltaDataFileIterator(deltaSnapshot, schema, true); + } + + public class DeltaDataFileIterator implements DataFileIterator { + private final FileFormat fileFormat; + private final List fields; + private final List partitionFields; + private Iterator dataFilesIterator = Collections.emptyIterator(); + + private DeltaDataFileIterator( + Snapshot snapshot, InternalSchema schema, boolean includeColumnStats) { + String provider = ((SnapshotImpl) snapshot).getMetadata().getFormat().getProvider(); + this.fileFormat = actionsConverter.convertToFileFormat(provider); + + this.fields = schema.getFields(); + + StructType fullSchema = snapshot.getSchema(); // The full table schema + List partitionColumns = snapshot.getPartitionColumnNames(); // List + + List partitionFields_strfld = + fullSchema.fields().stream() + .filter(field -> partitionColumns.contains(field.getName())) + .collect(Collectors.toList()); + + StructType partitionSchema = new StructType(partitionFields_strfld); + + this.partitionFields = + partitionExtractor.convertFromDeltaPartitionFormat(schema, partitionSchema); + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + + Scan myScan = snapshot.getScanBuilder().build(); + CloseableIterator scanFiles = myScan.getScanFiles(engine); + this.dataFilesIterator = + Collections + .emptyIterator(); // Initialize the dataFilesIterator by iterating over the scan files + while (scanFiles.hasNext()) { + FilteredColumnarBatch scanFileColumnarBatch = scanFiles.next(); + CloseableIterator scanFileRows = scanFileColumnarBatch.getRows(); + while (scanFileRows.hasNext()) { + Row scanFileRow = scanFileRows.next(); + + // From the scan file row, extract the file path, size and modification time metadata + // needed to read the file. + FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); + Map partitionValues = + InternalScanFileUtils.getPartitionValues(scanFileRow); + // Convert the FileStatus to InternalDataFile using the actionsConverter + System.out.println("Calling the ActionToInternalDataFile"); + this.dataFilesIterator = + Collections.singletonList( + actionsConverter.convertAddActionToInternalDataFile( + fileStatus, + snapshot, + fileFormat, + partitionFields, + fields, + includeColumnStats, + partitionExtractor, + fileStatsExtractor, + partitionValues)) + .iterator(); + } + } + } + + @Override + public void close() throws Exception {} + + @Override + public boolean hasNext() { + return this.dataFilesIterator.hasNext(); + } + + @Override + public InternalDataFile next() { + return dataFilesIterator.next(); + } + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelPartitionExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelPartitionExtractor.java new file mode 100644 index 000000000..cf81b73a1 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelPartitionExtractor.java @@ -0,0 +1,540 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.delta; + +import static org.apache.xtable.collectors.CustomCollectors.toList; +import static org.apache.xtable.delta.DeltaValueConverter.convertFromDeltaPartitionValue; +import static org.apache.xtable.delta.DeltaValueConverter.convertToDeltaPartitionValue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import lombok.AccessLevel; +import lombok.Builder; +import lombok.NoArgsConstructor; +import lombok.extern.log4j.Log4j2; + +import org.apache.spark.sql.types.Metadata; + +import scala.collection.JavaConverters; + +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; + +import io.delta.kernel.types.*; +import io.delta.kernel.types.FieldMetadata; + +import org.apache.xtable.exception.PartitionSpecException; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.schema.PartitionTransformType; +import org.apache.xtable.model.stat.PartitionValue; +import org.apache.xtable.model.stat.Range; +import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.xtable.schema.SchemaFieldFinder; + +@Log4j2 +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DeltaKernelPartitionExtractor { + private static final DeltaKernelPartitionExtractor INSTANCE = new DeltaKernelPartitionExtractor(); + private static final String CAST_FUNCTION = "CAST(%s as DATE)"; + private static final String DATE_FORMAT_FUNCTION = "DATE_FORMAT(%s, '%s')"; + private static final String YEAR_FUNCTION = "YEAR(%s)"; + private static final String DATE_FORMAT_FOR_HOUR = "yyyy-MM-dd-HH"; + private static final String DATE_FORMAT_FOR_DAY = "yyyy-MM-dd"; + private static final String DATE_FORMAT_FOR_MONTH = "yyyy-MM"; + private static final String DATE_FORMAT_FOR_YEAR = "yyyy"; + private static final String BUCKET_FUNCTION = "MOD((HASH(%s) & %d), %d)"; + // For timestamp partition fields, actual partition column names in delta format will be of type + // generated & and with a name like `delta_partition_col_{transform_type}_{source_field_name}`. + private static final String DELTA_PARTITION_COL_NAME_FORMAT = "xtable_partition_col_%s_%s"; + static final String DELTA_GENERATION_EXPRESSION = "delta.generationExpression"; + private static final List GRANULARITIES = + Arrays.asList( + ParsedGeneratedExpr.GeneratedExprType.YEAR, + ParsedGeneratedExpr.GeneratedExprType.MONTH, + ParsedGeneratedExpr.GeneratedExprType.DAY, + ParsedGeneratedExpr.GeneratedExprType.HOUR); + + public static DeltaKernelPartitionExtractor getInstance() { + return INSTANCE; + } + + /** + * Extracts partition fields from delta table. Partitioning by nested columns isn't supported. 
+ * Example: Given a delta table and a reference to DeltaLog, method parameters can be obtained by + * deltaLog = DeltaLog.forTable(spark, deltaTablePath); InternalSchema internalSchema = + * DeltaSchemaExtractor.getInstance().toInternalSchema(deltaLog.snapshot().schema()); StructType + * partitionSchema = deltaLog.metadata().partitionSchema(); + * + * @param internalSchema canonical representation of the schema. + * @param partitionSchema partition schema of the delta table. + * @return list of canonical representation of the partition fields + */ + public List convertFromDeltaPartitionFormat( + InternalSchema internalSchema, StructType partitionSchema) { + if (partitionSchema.fields().size() == 0) { + return Collections.emptyList(); + } + return getInternalPartitionFields(partitionSchema, internalSchema); + } + + /** + * If all of them are value process individually and return. If they contain month they should + * contain year as well. If they contain day they should contain month and year as well. If they + * contain hour they should contain day, month and year as well. Other supports CAST(col as DATE) + * and DATE_FORMAT(col, 'yyyy-MM-dd'). Partition by nested fields may not be fully supported. + */ + private List getInternalPartitionFields( + StructType partitionSchema, InternalSchema internalSchema) { + PeekingIterator itr = + Iterators.peekingIterator(partitionSchema.fields().iterator()); + List partitionFields = new ArrayList<>(partitionSchema.fields().size()); + while (itr.hasNext()) { + StructField currPartitionField = itr.peek(); + if (!currPartitionField.getMetadata().contains(DELTA_GENERATION_EXPRESSION)) { + partitionFields.add( + InternalPartitionField.builder() + .sourceField( + SchemaFieldFinder.getInstance() + .findFieldByPath(internalSchema, currPartitionField.getName())) + .transformType(PartitionTransformType.VALUE) + .build()); + itr.next(); // consume the field. + } else { + // Partition contains generated expression. + // if it starts with year we should consume until we hit field with no generated expression + // or we hit a field with generated expression that is of cast or date format. + String expr = currPartitionField.getMetadata().getString(DELTA_GENERATION_EXPRESSION); + ParsedGeneratedExpr parsedGeneratedExpr = + ParsedGeneratedExpr.buildFromString(currPartitionField.getName(), expr); + if (ParsedGeneratedExpr.GeneratedExprType.CAST == parsedGeneratedExpr.generatedExprType) { + partitionFields.add( + getPartitionWithDateTransform( + currPartitionField.getName(), parsedGeneratedExpr, internalSchema)); + itr.next(); // consume the field. + } else if (ParsedGeneratedExpr.GeneratedExprType.DATE_FORMAT + == parsedGeneratedExpr.generatedExprType) { + partitionFields.add( + getPartitionWithDateFormatTransform( + currPartitionField.getName(), parsedGeneratedExpr, internalSchema)); + itr.next(); // consume the field. + } else { + // consume until we hit field with no generated expression or generated expression + // that is not of type cast or date format. 
+ List parsedGeneratedExprs = new ArrayList<>(); + while (itr.hasNext() + && currPartitionField.getMetadata().contains(DELTA_GENERATION_EXPRESSION)) { + expr = currPartitionField.getMetadata().getString(DELTA_GENERATION_EXPRESSION); + parsedGeneratedExpr = + ParsedGeneratedExpr.buildFromString(currPartitionField.getName(), expr); + + if (ParsedGeneratedExpr.GeneratedExprType.CAST == parsedGeneratedExpr.generatedExprType + || ParsedGeneratedExpr.GeneratedExprType.DATE_FORMAT + == parsedGeneratedExpr.generatedExprType) { + break; + } + parsedGeneratedExprs.add(parsedGeneratedExpr); + itr.next(); // consume the field + if (itr.hasNext()) { + currPartitionField = itr.peek(); + } + } + partitionFields.add( + getPartitionColumnsForHourOrDayOrMonthOrYear(parsedGeneratedExprs, internalSchema)); + } + } + } + return partitionFields; + } + + private InternalPartitionField getPartitionColumnsForHourOrDayOrMonthOrYear( + List parsedGeneratedExprs, InternalSchema internalSchema) { + if (parsedGeneratedExprs.size() > 4) { + throw new IllegalStateException("Invalid partition transform"); + } + validate( + parsedGeneratedExprs, new HashSet<>(GRANULARITIES.subList(0, parsedGeneratedExprs.size()))); + + ParsedGeneratedExpr transform = parsedGeneratedExprs.get(0); + List partitionColumns = + parsedGeneratedExprs.stream() + .map(parsedGeneratedExpr -> parsedGeneratedExpr.partitionColumnName) + .collect(toList(parsedGeneratedExprs.size())); + return InternalPartitionField.builder() + .sourceField( + SchemaFieldFinder.getInstance().findFieldByPath(internalSchema, transform.sourceColumn)) + .partitionFieldNames(partitionColumns) + .transformType( + parsedGeneratedExprs.get(parsedGeneratedExprs.size() - 1) + .internalPartitionTransformType) + .build(); + } + + // Cast has default format of yyyy-MM-dd. + private InternalPartitionField getPartitionWithDateTransform( + String partitionColumnName, + ParsedGeneratedExpr parsedGeneratedExpr, + InternalSchema internalSchema) { + return InternalPartitionField.builder() + .sourceField( + SchemaFieldFinder.getInstance() + .findFieldByPath(internalSchema, parsedGeneratedExpr.sourceColumn)) + .partitionFieldNames(Collections.singletonList(partitionColumnName)) + .transformType(PartitionTransformType.DAY) + .build(); + } + + private InternalPartitionField getPartitionWithDateFormatTransform( + String partitionColumnName, + ParsedGeneratedExpr parsedGeneratedExpr, + InternalSchema internalSchema) { + return InternalPartitionField.builder() + .sourceField( + SchemaFieldFinder.getInstance() + .findFieldByPath(internalSchema, parsedGeneratedExpr.sourceColumn)) + .partitionFieldNames(Collections.singletonList(partitionColumnName)) + .transformType(parsedGeneratedExpr.internalPartitionTransformType) + .build(); + } + + public Map convertToDeltaPartitionFormat( + List partitionFields) { + if (partitionFields == null) { + return null; + } + Map nameToStructFieldMap = new HashMap<>(); + for (InternalPartitionField internalPartitionField : partitionFields) { + String currPartitionColumnName; + StructField field; + + if (internalPartitionField.getTransformType() == PartitionTransformType.VALUE) { + currPartitionColumnName = internalPartitionField.getSourceField().getName(); + field = null; + } else { + // Since partition field of timestamp or bucket type, create new field in schema. 
+ field = getGeneratedField(internalPartitionField); + currPartitionColumnName = field.getName(); + } + nameToStructFieldMap.put(currPartitionColumnName, field); + } + return nameToStructFieldMap; + } + + public Map partitionValueSerialization(InternalDataFile internalDataFile) { + Map partitionValuesSerialized = new HashMap<>(); + if (internalDataFile.getPartitionValues() == null + || internalDataFile.getPartitionValues().isEmpty()) { + return partitionValuesSerialized; + } + for (PartitionValue partitionValue : internalDataFile.getPartitionValues()) { + InternalPartitionField partitionField = partitionValue.getPartitionField(); + PartitionTransformType transformType = partitionField.getTransformType(); + String partitionValueSerialized; + if (transformType == PartitionTransformType.VALUE) { + partitionValueSerialized = + convertToDeltaPartitionValue( + partitionValue.getRange().getMaxValue(), + partitionField.getSourceField().getSchema().getDataType(), + transformType, + ""); + partitionValuesSerialized.put( + partitionField.getSourceField().getName(), partitionValueSerialized); + } else if (transformType == PartitionTransformType.BUCKET) { + partitionValueSerialized = partitionValue.getRange().getMaxValue().toString(); + partitionValuesSerialized.put( + getGeneratedColumnName(partitionField), partitionValueSerialized); + } else { + // use appropriate date formatter for value serialization. + partitionValueSerialized = + convertToDeltaPartitionValue( + partitionValue.getRange().getMaxValue(), + partitionField.getSourceField().getSchema().getDataType(), + transformType, + getDateFormat(partitionField.getTransformType())); + partitionValuesSerialized.put( + getGeneratedColumnName(partitionField), partitionValueSerialized); + } + } + return partitionValuesSerialized; + } + + public List partitionValueExtraction( + scala.collection.Map values, List partitionFields) { + return partitionFields.stream() + .map( + partitionField -> { + PartitionTransformType partitionTransformType = partitionField.getTransformType(); + String dateFormat = + partitionTransformType.isTimeBased() + ? 
getDateFormat(partitionTransformType) + : null; + String serializedValue = + getSerializedPartitionValue(convertScalaMapToJavaMap(values), partitionField); + Object partitionValue = + convertFromDeltaPartitionValue( + serializedValue, + partitionField.getSourceField().getSchema().getDataType(), + partitionField.getTransformType(), + dateFormat); + return PartitionValue.builder() + .partitionField(partitionField) + .range(Range.scalar(partitionValue)) + .build(); + }) + .collect(toList(partitionFields.size())); + } + + private String getSerializedPartitionValue( + Map values, InternalPartitionField partitionField) { + if (partitionField.getPartitionFieldNames() == null + || partitionField.getPartitionFieldNames().isEmpty()) { + return values.getOrDefault(partitionField.getSourceField().getName(), null); + } + List partitionFieldNames = partitionField.getPartitionFieldNames(); + if (partitionFieldNames.size() == 1) { + return values.getOrDefault(partitionFieldNames.get(0), null); + } + return partitionFieldNames.stream() + .map(name -> values.get(name)) + .collect(Collectors.joining("-")); + } + + private String getGeneratedColumnName(InternalPartitionField internalPartitionField) { + return String.format( + DELTA_PARTITION_COL_NAME_FORMAT, + internalPartitionField.getTransformType().toString(), + internalPartitionField.getSourceField().getName()); + } + + private String getDateFormat(PartitionTransformType transformType) { + switch (transformType) { + case YEAR: + return DATE_FORMAT_FOR_YEAR; + case MONTH: + return DATE_FORMAT_FOR_MONTH; + case DAY: + return DATE_FORMAT_FOR_DAY; + case HOUR: + return DATE_FORMAT_FOR_HOUR; + default: + throw new PartitionSpecException("Invalid transform type"); + } + } + + private StructField getGeneratedField(InternalPartitionField internalPartitionField) { + String generatedExpression; + DataType dataType; + String currPartitionColumnName = getGeneratedColumnName(internalPartitionField); + switch (internalPartitionField.getTransformType()) { + case YEAR: + generatedExpression = + String.format(YEAR_FUNCTION, internalPartitionField.getSourceField().getPath()); + dataType = IntegerType.INTEGER; + break; + case MONTH: + case HOUR: + generatedExpression = + String.format( + DATE_FORMAT_FUNCTION, + internalPartitionField.getSourceField().getPath(), + getDateFormat(internalPartitionField.getTransformType())); + dataType = IntegerType.INTEGER; + break; + case DAY: + generatedExpression = + String.format(CAST_FUNCTION, internalPartitionField.getSourceField().getPath()); + dataType = DateType.DATE; + break; + case BUCKET: + generatedExpression = + String.format( + BUCKET_FUNCTION, + internalPartitionField.getSourceField().getPath(), + Integer.MAX_VALUE, + (int) + internalPartitionField + .getTransformOptions() + .get(InternalPartitionField.NUM_BUCKETS)); + dataType = IntegerType.INTEGER; + break; + default: + throw new PartitionSpecException("Invalid transform type"); + } + Map generatedExpressionMetadata = + Collections.singletonMap(DELTA_GENERATION_EXPRESSION, generatedExpression); + Metadata partitionFieldMetadata = + new Metadata(ScalaUtils.convertJavaMapToScala(generatedExpressionMetadata)); + return new StructField(currPartitionColumnName, dataType, true, FieldMetadata.empty()); + } + + private void validate( + List parsedGeneratedExprs, + Set expectedTypesToBePresent) { + Set sourceFields = + parsedGeneratedExprs.stream().map(expr -> expr.sourceColumn).collect(Collectors.toSet()); + if (sourceFields.size() > 1) { + log.error( + String.format("Multiple source 
columns found for partition transform: %s", sourceFields)); + throw new PartitionSpecException( + String.format("Multiple source columns found for partition transform: %s", sourceFields)); + } + Set actualTypesPresent = + parsedGeneratedExprs.stream() + .map(expr -> expr.generatedExprType) + .collect(Collectors.toSet()); + if (!actualTypesPresent.equals(expectedTypesToBePresent)) { + log.error( + "Mismatched types present. Expected: " + + expectedTypesToBePresent + + ", Found: " + + actualTypesPresent); + throw new PartitionSpecException( + "Mismatched types present. Expected: " + + expectedTypesToBePresent + + ", Found: " + + actualTypesPresent); + } + } + + private Map convertScalaMapToJavaMap( + scala.collection.Map scalaMap) { + return JavaConverters.mapAsJavaMapConverter(scalaMap).asJava(); + } + + @Builder + static class ParsedGeneratedExpr { + private static final Pattern YEAR_PATTERN = Pattern.compile("YEAR\\(([^)]+)\\)"); + private static final Pattern MONTH_PATTERN = Pattern.compile("MONTH\\(([^)]+)\\)"); + private static final Pattern DAY_PATTERN = Pattern.compile("DAY\\(([^)]+)\\)"); + private static final Pattern HOUR_PATTERN = Pattern.compile("HOUR\\(([^)]+)\\)"); + private static final Pattern CAST_PATTERN = Pattern.compile("CAST\\(([^ ]+) AS DATE\\)"); + private static final Pattern DATE_FORMAT_PATTERN = + Pattern.compile("DATE_FORMAT\\(([^,]+),[^']+'([^']+)'\\)"); + + enum GeneratedExprType { + YEAR, + MONTH, + DAY, + HOUR, + CAST, + DATE_FORMAT + } + + String sourceColumn; + String partitionColumnName; + GeneratedExprType generatedExprType; + PartitionTransformType internalPartitionTransformType; + + private static ParsedGeneratedExpr buildFromString(String partitionColumnName, String expr) { + if (expr.contains("YEAR")) { + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.YEAR) + .partitionColumnName(partitionColumnName) + .sourceColumn(extractColumnName(expr, YEAR_PATTERN)) + .internalPartitionTransformType(PartitionTransformType.YEAR) + .build(); + } else if (expr.contains("MONTH")) { + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.MONTH) + .partitionColumnName(partitionColumnName) + .sourceColumn(extractColumnName(expr, MONTH_PATTERN)) + .internalPartitionTransformType(PartitionTransformType.MONTH) + .build(); + } else if (expr.contains("DAY")) { + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.DAY) + .partitionColumnName(partitionColumnName) + .sourceColumn(extractColumnName(expr, DAY_PATTERN)) + .internalPartitionTransformType(PartitionTransformType.DAY) + .build(); + } else if (expr.contains("HOUR")) { + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.HOUR) + .partitionColumnName(partitionColumnName) + .sourceColumn(extractColumnName(expr, HOUR_PATTERN)) + .internalPartitionTransformType(PartitionTransformType.HOUR) + .build(); + } else if (expr.contains("CAST")) { + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.CAST) + .partitionColumnName(partitionColumnName) + .sourceColumn(extractColumnName(expr, CAST_PATTERN)) + .internalPartitionTransformType(PartitionTransformType.DAY) + .build(); + } else if (expr.contains("DATE_FORMAT")) { + Matcher matcher = DATE_FORMAT_PATTERN.matcher(expr); + if (matcher.find()) { + /* + * from DATE_FORMAT(source_col, 'yyyy-MM-dd-HH') the code below extracts yyyy-MM-dd-HH. 
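+           * group(1) captures the source column and group(2) the format string; the format is
+           * then mapped to a transform type, e.g. 'yyyy-MM' to MONTH, by computeInternalPartitionTransform.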
+ */ + String fieldName = matcher.group(1); + String dateFormatExpr = matcher.group(2); + return ParsedGeneratedExpr.builder() + .generatedExprType(GeneratedExprType.DATE_FORMAT) + .partitionColumnName(partitionColumnName) + .sourceColumn(fieldName) + .internalPartitionTransformType(computeInternalPartitionTransform(dateFormatExpr)) + .build(); + } else { + throw new IllegalArgumentException("Could not extract values from: " + expr); + } + } else { + throw new IllegalArgumentException( + "Unsupported expression for generated expression: " + expr); + } + } + + // Supporting granularity as per https://docs.databricks.com/en/delta/generated-columns.html + private static PartitionTransformType computeInternalPartitionTransform(String dateFormatExpr) { + if (DATE_FORMAT_FOR_HOUR.equals(dateFormatExpr)) { + return PartitionTransformType.HOUR; + } else if (DATE_FORMAT_FOR_DAY.equals(dateFormatExpr)) { + return PartitionTransformType.DAY; + } else if (DATE_FORMAT_FOR_MONTH.equals(dateFormatExpr)) { + return PartitionTransformType.MONTH; + } else { + throw new IllegalArgumentException( + String.format( + "Unsupported date format expression: %s for generated expression", dateFormatExpr)); + } + } + + private static String extractColumnName(String expr, Pattern regexPattern) { + Matcher matcher = regexPattern.matcher(expr); + if (matcher.find()) { + return matcher.group(1).trim(); + } + throw new IllegalArgumentException( + "Could not extract column name from: " + + expr + + " using pattern: " + + regexPattern.pattern()); + } + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java new file mode 100644 index 000000000..bedb67ad1 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java @@ -0,0 +1,310 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.xtable.delta; + +import java.io.IOException; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Value; +import lombok.extern.log4j.Log4j2; + +import org.apache.commons.lang3.StringUtils; + +import com.fasterxml.jackson.annotation.JsonAnySetter; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.annotations.VisibleForTesting; + +import io.delta.kernel.statistics.DataFileStatistics; +import io.delta.kernel.utils.DataFileStatus; + +import org.apache.xtable.collectors.CustomCollectors; +import org.apache.xtable.model.exception.ParseException; +import org.apache.xtable.model.schema.InternalField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.FileStats; +import org.apache.xtable.model.stat.Range; + +/** + * DeltaStatsExtractor extracts column stats and also responsible for their serialization leveraging + * {@link DeltaValueConverter}. + */ +@Log4j2 +@NoArgsConstructor(access = AccessLevel.PRIVATE) +public class DeltaKernelStatsExtractor { + private static final Set FIELD_TYPES_WITH_STATS_SUPPORT = + new HashSet<>( + Arrays.asList( + InternalType.BOOLEAN, + InternalType.DATE, + InternalType.DECIMAL, + InternalType.DOUBLE, + InternalType.INT, + InternalType.LONG, + InternalType.FLOAT, + InternalType.STRING, + InternalType.TIMESTAMP, + InternalType.TIMESTAMP_NTZ)); + + private static final DeltaKernelStatsExtractor INSTANCE = new DeltaKernelStatsExtractor(); + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + /* this data structure collects type names of all unrecognized Delta Lake stats. 
For instance + data file stats in presence of delete vectors would contain 'tightBounds' stat which is + currently not handled by XTable */ + private final Set unsupportedStats = new HashSet<>(); + + public static DeltaKernelStatsExtractor getInstance() { + return INSTANCE; + } + + public String convertStatsToDeltaFormat( + InternalSchema schema, long numRecords, List columnStats) + throws JsonProcessingException { + DeltaStats.DeltaStatsBuilder deltaStatsBuilder = DeltaStats.builder(); + deltaStatsBuilder.numRecords(numRecords); + if (columnStats == null) { + return MAPPER.writeValueAsString(deltaStatsBuilder.build()); + } + Set validPaths = getPathsFromStructSchemaForMinAndMaxStats(schema); + List validColumnStats = + columnStats.stream() + .filter(stat -> validPaths.contains(stat.getField().getPath())) + .collect(Collectors.toList()); + DeltaStats deltaStats = + deltaStatsBuilder + .minValues(getMinValues(validColumnStats)) + .maxValues(getMaxValues(validColumnStats)) + .nullCount(getNullCount(validColumnStats)) + .build(); + return MAPPER.writeValueAsString(deltaStats); + } + + private Set getPathsFromStructSchemaForMinAndMaxStats(InternalSchema schema) { + return schema.getAllFields().stream() + .filter( + field -> { + InternalType type = field.getSchema().getDataType(); + return FIELD_TYPES_WITH_STATS_SUPPORT.contains(type); + }) + .map(InternalField::getPath) + .collect(Collectors.toSet()); + } + + private Map getMinValues(List validColumnStats) { + return getValues(validColumnStats, columnStat -> columnStat.getRange().getMinValue()); + } + + private Map getMaxValues(List validColumnStats) { + return getValues(validColumnStats, columnStat -> columnStat.getRange().getMaxValue()); + } + + private Map getValues( + List validColumnStats, Function valueExtractor) { + Map jsonObject = new HashMap<>(); + validColumnStats.forEach( + columnStat -> { + InternalField field = columnStat.getField(); + String[] pathParts = field.getPathParts(); + insertValueAtPath( + jsonObject, + pathParts, + DeltaValueConverter.convertToDeltaColumnStatValue( + valueExtractor.apply(columnStat), field.getSchema())); + }); + return jsonObject; + } + + private Map getNullCount(List validColumnStats) { + // TODO: Additional work needed to track nulls maps & arrays. 
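+    // Null counts nest the same way as the min/max values: a stat for field "a.b" is written
+    // as {"a": {"b": <numNulls>}} by insertValueAtPath (field name illustrative).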
+ Map jsonObject = new HashMap<>(); + validColumnStats.forEach( + columnStat -> { + String[] pathParts = columnStat.getField().getPathParts(); + insertValueAtPath(jsonObject, pathParts, columnStat.getNumNulls()); + }); + return jsonObject; + } + + private void insertValueAtPath(Map jsonObject, String[] pathParts, Object value) { + if (pathParts == null || pathParts.length == 0) { + return; + } + Map currObject = jsonObject; + for (int i = 0; i < pathParts.length; i++) { + String part = pathParts[i]; + if (i == pathParts.length - 1) { + currObject.put(part, value); + } else { + if (!currObject.containsKey(part)) { + currObject.put(part, new HashMap()); + } + try { + currObject = (HashMap) currObject.get(part); + } catch (ClassCastException e) { + throw new RuntimeException( + String.format( + "Cannot cast to hashmap while inserting stats at path %s", + String.join("->", pathParts)), + e); + } + } + } + } + + public FileStats getColumnStatsForFile(DataFileStatus addFile, List fields) { + + Optional statsOpt = addFile.getStatistics().map(DataFileStatistics::toString); + System.out.println("statsOpt:" + statsOpt); + if (!statsOpt.isPresent() || StringUtils.isEmpty(statsOpt.get())) { + System.out.println("No statistics available1"); + // No statistics available + return FileStats.builder().columnStats(Collections.emptyList()).numRecords(0).build(); + } + // TODO: Additional work needed to track maps & arrays. + try { + DeltaStats deltaStats = + MAPPER.readValue(addFile.getStatistics().get().toString(), DeltaStats.class); + System.out.println("deltaStats:" + deltaStats); + collectUnsupportedStats(deltaStats.getAdditionalStats()); + + Map fieldPathToMaxValue = flattenStatMap(deltaStats.getMaxValues()); + Map fieldPathToMinValue = flattenStatMap(deltaStats.getMinValues()); + Map fieldPathToNullCount = flattenStatMap(deltaStats.getNullCount()); + List columnStats = + fields.stream() + .filter(field -> fieldPathToMaxValue.containsKey(field.getPath())) + .map( + field -> { + String fieldPath = field.getPath(); + Object minValue = + DeltaValueConverter.convertFromDeltaColumnStatValue( + fieldPathToMinValue.get(fieldPath), field.getSchema()); + Object maxValue = + DeltaValueConverter.convertFromDeltaColumnStatValue( + fieldPathToMaxValue.get(fieldPath), field.getSchema()); + Number nullCount = (Number) fieldPathToNullCount.get(fieldPath); + Range range = Range.vector(minValue, maxValue); + return ColumnStat.builder() + .field(field) + .numValues(deltaStats.getNumRecords()) + .numNulls(nullCount.longValue()) + .range(range) + .build(); + }) + .collect(CustomCollectors.toList(fields.size())); + return FileStats.builder() + .columnStats(columnStats) + .numRecords(deltaStats.getNumRecords()) + .build(); + } catch (IOException ex) { + throw new ParseException("Unable to parse stats json", ex); + } + } + + private void collectUnsupportedStats(Map additionalStats) { + if (additionalStats == null || additionalStats.isEmpty()) { + return; + } + + additionalStats.keySet().stream() + .filter(key -> !unsupportedStats.contains(key)) + .forEach( + key -> { + log.info("Unrecognized/unsupported Delta data file stat: {}", key); + unsupportedStats.add(key); + }); + } + + /** + * Takes the input map which represents a json object and flattens it. 
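+   * For example, {"a": {"b": 1, "c": 2}} flattens to {"a.b": 1, "a.c": 2}.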
+ * + * @param statMap input json map + * @return map with keys representing the dot-path for the field + */ + private Map flattenStatMap(Map statMap) { + Map result = new HashMap<>(); + Queue statFieldQueue = new ArrayDeque<>(); + statFieldQueue.add(StatField.of("", statMap)); + while (!statFieldQueue.isEmpty()) { + StatField statField = statFieldQueue.poll(); + String prefix = statField.getParentPath().isEmpty() ? "" : statField.getParentPath() + "."; + statField + .getValues() + .forEach( + (fieldName, value) -> { + String fullName = prefix + fieldName; + if (value instanceof Map) { + statFieldQueue.add(StatField.of(fullName, (Map) value)); + } else { + result.put(fullName, value); + } + }); + } + return result; + } + + /** + * Returns the names of all unsupported stats that have been discovered during the parsing of + * Delta Lake stats. + * + * @return set of unsupported stats + */ + @VisibleForTesting + Set getUnsupportedStats() { + return Collections.unmodifiableSet(unsupportedStats); + } + + @Builder + @Value + private static class DeltaStats { + long numRecords; + Map minValues; + Map maxValues; + Map nullCount; + + /* this is a catch-all for any additional stats that are not explicitly handled */ + @JsonIgnore + @Getter(lazy = true) + Map additionalStats = new HashMap<>(); + + @JsonAnySetter + public void setAdditionalStat(String key, Object value) { + getAdditionalStats().put(key, value); + } + } + + @Value + @AllArgsConstructor(staticName = "of") + private static class StatField { + String parentPath; + Map values; + } +} diff --git a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java index f56f333b0..958683045 100644 --- a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.time.Instant; +import java.util.ArrayList; +import java.util.List; import lombok.Builder; @@ -30,29 +32,31 @@ import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; -import org.apache.xtable.delta.DeltaKernelTableExtractor; +import org.apache.xtable.delta.*; import org.apache.xtable.exception.ReadException; import org.apache.xtable.model.*; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.xtable.model.storage.PartitionFileGroup; import org.apache.xtable.spi.extractor.ConversionSource; +import org.apache.xtable.spi.extractor.DataFileIterator; @Builder public class DeltaKernelConversionSource implements ConversionSource { + + @Builder.Default + private final DeltaKernelDataFileExtractor dataFileExtractor = + DeltaKernelDataFileExtractor.builder().build(); + private final String basePath; private final String tableName; private final Engine engine; + // private final DeltaKernelTableExtractor tableExtractor; @Builder.Default private final DeltaKernelTableExtractor tableExtractor = DeltaKernelTableExtractor.builder().build(); - // private final DeltaKernelActionsConverter actionsConverter; - - // public DeltaKernelConversionSource(String basePath, String tableName, Engine engine) { - // this.basePath = basePath; - // this.tableName = tableName; - // this.engine = engine; - // - // } @Override public InternalTable getTable(Long version) { @@ -80,7 +84,17 @@ public InternalTable getCurrentTable() { 
@Override public InternalSnapshot getCurrentSnapshot() { - return null; + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + System.out.println("getCurrentSnapshot12: " + basePath); + Table table_snapshot = Table.forPath(engine, basePath); + Snapshot snapshot = table_snapshot.getLatestSnapshot(engine); + InternalTable table = getTable(snapshot.getVersion()); + return InternalSnapshot.builder() + .table(table) + .partitionedDataFiles(getInternalDataFiles(snapshot, table.getReadSchema())) + .sourceIdentifier(getCommitIdentifier(snapshot.getVersion())) + .build(); } @Override @@ -104,6 +118,17 @@ public String getCommitIdentifier(Long aLong) { return ""; } + private List getInternalDataFiles( + io.delta.kernel.Snapshot snapshot, InternalSchema schema) { + try (DataFileIterator fileIterator = dataFileExtractor.iterator(snapshot, schema)) { + List dataFiles = new ArrayList<>(); + fileIterator.forEachRemaining(dataFiles::add); + return PartitionFileGroup.fromFiles(dataFiles); + } catch (Exception e) { + throw new ReadException("Failed to iterate through Delta data files", e); + } + } + @Override public void close() throws IOException {} diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index 0c67e894a..60e43c859 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -18,20 +18,39 @@ package org.apache.xtable.delta; +import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; import static org.apache.xtable.testutil.ITTestUtils.validateTable; import static org.junit.jupiter.api.Assertions.*; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Optional; +import io.delta.kernel.Scan; +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.FilteredColumnarBatch; +import io.delta.kernel.data.Row; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.data.ScanStateRow; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; import org.apache.hadoop.conf.Configuration; import org.apache.spark.serializer.KryoSerializer; import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.apache.xtable.model.InternalSnapshot; +import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.Range; +import org.junit.jupiter.api.*; import org.junit.jupiter.api.io.TempDir; import org.apache.xtable.GenericTable; @@ -45,6 +64,11 @@ import org.apache.xtable.model.storage.DataLayoutStrategy; import org.apache.xtable.model.storage.TableFormat; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.defaults.engine.DefaultEngine; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.*; + public class ITDeltaKernelConversionSource { private static final InternalField COL1_INT_FIELD = InternalField.builder() @@ 
-75,12 +99,28 @@ public class ITDeltaKernelConversionSource { .name("col3") .schema( InternalSchema.builder() - .name("string") - .dataType(InternalType.STRING) + .name("integer") + .dataType(InternalType.INT) .isNullable(true) .build()) .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) .build(); + private static final ColumnStat COL2_COLUMN_STAT = + ColumnStat.builder() + .field(COL2_INT_FIELD) + .range(Range.vector(2, 2)) + .numNulls(0) + .numValues(1) + .totalSize(0) + .build(); + private static final ColumnStat COL1_COLUMN_STAT = + ColumnStat.builder() + .field(COL1_INT_FIELD) + .range(Range.vector(1, 1)) + .numNulls(0) + .numValues(1) + .totalSize(0) + .build(); private DeltaKernelConversionSourceProvider conversionSourceProvider; private static SparkSession sparkSession; @@ -104,7 +144,12 @@ public static void setupOnce() { } @TempDir private static Path tempDir; - + @AfterAll + public static void teardown() { + if (sparkSession != null) { + sparkSession.close(); + } + } @BeforeEach void setUp() { Configuration hadoopConf = new Configuration(); @@ -125,7 +170,7 @@ void getCurrentTableTest() { + tableName + "` USING DELTA LOCATION '" + basePath - + "' AS SELECT * FROM VALUES (1, 2, '3')"); + + "' AS SELECT * FROM VALUES (1, 2, 3)"); // Create Delta source SourceTable tableConfig = SourceTable.builder() @@ -133,19 +178,19 @@ void getCurrentTableTest() { .basePath(basePath.toString()) .formatName(TableFormat.DELTA) .build(); - System.out.println( - "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); +// System.out.println( +// "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); DeltaKernelConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); // Get current table InternalTable internalTable = conversionSource.getCurrentTable(); List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD, COL3_STR_FIELD); - System.out.println("Internal Table: " + internalTable); - System.out.println("Fields: " + fields); - System.out.println("Table Format: " + TableFormat.DELTA); - System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); - System.out.println("Base Path: " + basePath); - System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); +// System.out.println("Internal Table: " + internalTable); +// System.out.println("Fields: " + fields); +// System.out.println("Table Format: " + TableFormat.DELTA); +// System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); +// System.out.println("Base Path: " + basePath); +// System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); // System.out.println("Latest getLatestMetadataPath : " + InternalSchema); validateTable( internalTable, @@ -161,4 +206,166 @@ void getCurrentTableTest() { internalTable.getLatestMetadataPath(), Collections.emptyList()); } + + @Test + void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { + // Table name + final String tableName = GenericTable.getTableName(); + final Path basePath = tempDir.resolve(tableName); + + System.out.println("Table Name: " + tableName); + System.out.println("Base Path: " + basePath); + // Create table with a single row using Spark + sparkSession.sql( + "CREATE TABLE `" + + tableName + + "` USING DELTA LOCATION '" + + basePath + + "' AS SELECT * FROM VALUES (1, 2)"); + // Create Delta source + SourceTable tableConfig = + SourceTable.builder() + .name(tableName) + .basePath(basePath.toString()) + 
.formatName(TableFormat.DELTA) + .build(); + DeltaKernelConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + // Get current snapshot + InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + +// snapshot.getPartitionedDataFiles().get(0) + // Validate table + List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); + validateTable( + snapshot.getTable(), + tableName, + TableFormat.DELTA, + InternalSchema.builder() + .name("struct") + .dataType(InternalType.RECORD) + .fields(fields) + .build(), + DataLayoutStrategy.FLAT, + "file://" + basePath, + snapshot.getTable().getLatestMetadataPath(), + Collections.emptyList()); + // Validate data files + List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); + Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); + +// validatePartitionDataFiles( +// PartitionFileGroup.builder() +// .files( +// Collections.singletonList( +// InternalDataFile.builder() +// .physicalPath("file:/fake/path") +// .fileFormat(FileFormat.APACHE_PARQUET) +// .partitionValues(Collections.emptyList()) +// .fileSizeBytes(716) +// .recordCount(1) +// .columnStats(columnStats) +// .build())) +// .partitionValues(Collections.emptyList()) +// .build(), +// snapshot.getPartitionedDataFiles().get(0)); +// System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); +// Configuration hadoopConf = new Configuration(); +// Engine myEngine = DefaultEngine.create(hadoopConf); +// Table myTable = Table.forPath(myEngine, basePath.toString()); +// Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); +// Scan myScan = mySnapshot.getScanBuilder().build(); +// +// +// // Common information about scanning for all data files to read. 
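+// // (The scan state carries the physical read schema and other metadata common to all files;
+// // it is passed to Scan.transformPhysicalData below along with each scan-file row.)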
+// Row scanState = myScan.getScanState(myEngine); +// +// // Information about the list of scan files to read +// CloseableIterator fileIter = myScan.getScanFiles(myEngine); +// int readRecordCount = 0; +// try { +// StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(myEngine, scanState); +// while (fileIter.hasNext()) { +// FilteredColumnarBatch scanFilesBatch = fileIter.next(); +// try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { +// while (scanFileRows.hasNext()) { +// Row scanFileRow = scanFileRows.next(); +// FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); +// CloseableIterator physicalDataIter = +// myEngine +// .getParquetHandler() +// .readParquetFiles( +// singletonCloseableIterator(fileStatus), +// physicalReadSchema, +// Optional.empty()); +// try (CloseableIterator transformedData = +// Scan.transformPhysicalData(myEngine, scanState, scanFileRow, physicalDataIter)) { +// while (transformedData.hasNext()) { +// FilteredColumnarBatch logicalData = transformedData.next(); +// ColumnarBatch dataBatch = logicalData.getData(); +// +// // access the data for the column at ordinal 0 +// ColumnVector column0 = dataBatch.getColumnVector(0); +// ColumnVector column1 = dataBatch.getColumnVector(1); +//// +//// for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { +//// System.out.println(column0.getInt(rowIndex)); +//// } +// for (int rowIndex = 0; rowIndex < column1.getSize(); rowIndex++) { +// System.out.println(column1.getInt(rowIndex)); +// } +// } +// } +// } +// } +// } +// } catch (IOException e) { +// e.printStackTrace(); +// System.out.println("IOException occurred: " + e.getMessage()); +// } + +} + private void validatePartitionDataFiles( + PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) + throws URISyntaxException { + assertEquals( + expectedPartitionFiles.getPartitionValues(), actualPartitionFiles.getPartitionValues()); + validateDataFiles(expectedPartitionFiles.getDataFiles(), actualPartitionFiles.getDataFiles()); + } + private void validateDataFiles( + List expectedFiles, List actualFiles) + throws URISyntaxException { + Assertions.assertEquals(expectedFiles.size(), actualFiles.size()); + for (int i = 0; i < expectedFiles.size(); i++) { + InternalDataFile expected = expectedFiles.get(i); + InternalDataFile actual = actualFiles.get(i); + validatePropertiesDataFile(expected, actual); + } + } + private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual) + throws URISyntaxException { + Assertions.assertTrue( + Paths.get(new URI(actual.getPhysicalPath()).getPath()).isAbsolute(), + () -> "path == " + actual.getPhysicalPath() + " is not absolute"); + Assertions.assertEquals(expected.getFileFormat(), actual.getFileFormat()); + Assertions.assertEquals(expected.getPartitionValues(), actual.getPartitionValues()); + Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes()); + System.out.println("Expected File Size: " + expected); + System.out.println("Actual File Size: " + actual); +// Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); +// Instant now = Instant.now(); +// long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); +// long maxRange = now.toEpochMilli(); +// Assertions.assertTrue( +// actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, +// () -> +// "last modified == " +// + actual.getLastModified() +// + " is expected between " +// + minRange +// + " 
and " +// + maxRange); +// Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); + } + } diff --git a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java index e760d1721..ca1b32ca5 100644 --- a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java +++ b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java @@ -48,7 +48,7 @@ public static void validateTable( String basePath, String latestMetadataPath, List partitioningFields) { - System.out.println("readSchema " + readSchema); + Assertions.assertEquals(tableName, internalTable.getName()); Assertions.assertEquals(tableFormat, internalTable.getTableFormat()); Assertions.assertEquals(readSchema, internalTable.getReadSchema()); From 18ab9d6a06ad97713ccae83a5c604db2e09d9111 Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sun, 20 Jul 2025 00:08:40 +0530 Subject: [PATCH 09/16] spotless fix --- .../delta/DeltaKernelActionsConverter.java | 9 +- .../delta/DeltaKernelDataFileExtractor.java | 2 - .../delta/ITDeltaKernelConversionSource.java | 310 +++++++++--------- 3 files changed, 154 insertions(+), 167 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java index 9cdd5305d..7e87d2203 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java @@ -25,7 +25,6 @@ import java.util.Map; import java.util.Optional; -import io.delta.kernel.statistics.DataFileStatistics; import lombok.AccessLevel; import lombok.NoArgsConstructor; @@ -68,15 +67,15 @@ public InternalDataFile convertAddActionToInternalDataFile( DeltaKernelPartitionExtractor partitionExtractor, DeltaKernelStatsExtractor fileStatsExtractor, Map partitionValues) { - DataFileStatus dataFileStatus = new DataFileStatus( + DataFileStatus dataFileStatus = + new DataFileStatus( addFile.getPath(), addFile.getModificationTime(), addFile.getSize(), Optional.empty() // or Optional.empty() if not available - ); + ); System.out.println("dataFileStatus:" + dataFileStatus); - FileStats fileStats = - fileStatsExtractor.getColumnStatsForFile(dataFileStatus, fields); + FileStats fileStats = fileStatsExtractor.getColumnStatsForFile(dataFileStatus, fields); System.out.println("fileStats:" + fileStats); List columnStats = includeColumnStats ? 
fileStats.getColumnStats() : Collections.emptyList(); diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java index adafea57d..ddb3b7782 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java @@ -22,14 +22,12 @@ import java.util.*; import java.util.stream.Collectors; -import io.delta.kernel.internal.actions.AddFile; import lombok.Builder; import org.apache.hadoop.conf.Configuration; import io.delta.kernel.Scan; import io.delta.kernel.Snapshot; -import io.delta.kernel.Table; import io.delta.kernel.data.FilteredColumnarBatch; import io.delta.kernel.data.Row; import io.delta.kernel.defaults.engine.DefaultEngine; diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index 60e43c859..3ddb89762 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -18,11 +18,9 @@ package org.apache.xtable.delta; -import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; import static org.apache.xtable.testutil.ITTestUtils.validateTable; import static org.junit.jupiter.api.Assertions.*; -import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Path; @@ -32,43 +30,29 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Optional; -import io.delta.kernel.Scan; -import io.delta.kernel.data.ColumnVector; -import io.delta.kernel.data.ColumnarBatch; -import io.delta.kernel.data.FilteredColumnarBatch; -import io.delta.kernel.data.Row; -import io.delta.kernel.internal.InternalScanFileUtils; -import io.delta.kernel.internal.data.ScanStateRow; -import io.delta.kernel.types.StructType; -import io.delta.kernel.utils.CloseableIterator; -import io.delta.kernel.utils.FileStatus; import org.apache.hadoop.conf.Configuration; import org.apache.spark.serializer.KryoSerializer; import org.apache.spark.sql.SparkSession; -import org.apache.xtable.model.InternalSnapshot; -import org.apache.xtable.model.stat.ColumnStat; -import org.apache.xtable.model.stat.Range; import org.junit.jupiter.api.*; import org.junit.jupiter.api.io.TempDir; +import io.delta.kernel.*; + import org.apache.xtable.GenericTable; import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.kernel.DeltaKernelConversionSource; +import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.schema.InternalField; import org.apache.xtable.model.schema.InternalSchema; import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.Range; import org.apache.xtable.model.storage.*; import org.apache.xtable.model.storage.DataLayoutStrategy; import org.apache.xtable.model.storage.TableFormat; -import io.delta.kernel.utils.CloseableIterator; -import io.delta.kernel.defaults.engine.DefaultEngine; -import io.delta.kernel.engine.Engine; -import io.delta.kernel.*; - public class ITDeltaKernelConversionSource { private static final InternalField COL1_INT_FIELD = InternalField.builder() @@ -106,21 +90,21 
@@ public class ITDeltaKernelConversionSource { .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) .build(); private static final ColumnStat COL2_COLUMN_STAT = - ColumnStat.builder() - .field(COL2_INT_FIELD) - .range(Range.vector(2, 2)) - .numNulls(0) - .numValues(1) - .totalSize(0) - .build(); + ColumnStat.builder() + .field(COL2_INT_FIELD) + .range(Range.vector(2, 2)) + .numNulls(0) + .numValues(1) + .totalSize(0) + .build(); private static final ColumnStat COL1_COLUMN_STAT = - ColumnStat.builder() - .field(COL1_INT_FIELD) - .range(Range.vector(1, 1)) - .numNulls(0) - .numValues(1) - .totalSize(0) - .build(); + ColumnStat.builder() + .field(COL1_INT_FIELD) + .range(Range.vector(1, 1)) + .numNulls(0) + .numValues(1) + .totalSize(0) + .build(); private DeltaKernelConversionSourceProvider conversionSourceProvider; private static SparkSession sparkSession; @@ -144,12 +128,14 @@ public static void setupOnce() { } @TempDir private static Path tempDir; + @AfterAll public static void teardown() { if (sparkSession != null) { sparkSession.close(); } } + @BeforeEach void setUp() { Configuration hadoopConf = new Configuration(); @@ -178,19 +164,19 @@ void getCurrentTableTest() { .basePath(basePath.toString()) .formatName(TableFormat.DELTA) .build(); -// System.out.println( -// "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); + // System.out.println( + // "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); DeltaKernelConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); // Get current table InternalTable internalTable = conversionSource.getCurrentTable(); List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD, COL3_STR_FIELD); -// System.out.println("Internal Table: " + internalTable); -// System.out.println("Fields: " + fields); -// System.out.println("Table Format: " + TableFormat.DELTA); -// System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); -// System.out.println("Base Path: " + basePath); -// System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); + // System.out.println("Internal Table: " + internalTable); + // System.out.println("Fields: " + fields); + // System.out.println("Table Format: " + TableFormat.DELTA); + // System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); + // System.out.println("Base Path: " + basePath); + // System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); // System.out.println("Latest getLatestMetadataPath : " + InternalSchema); validateTable( internalTable, @@ -217,124 +203,128 @@ void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { System.out.println("Base Path: " + basePath); // Create table with a single row using Spark sparkSession.sql( - "CREATE TABLE `" - + tableName - + "` USING DELTA LOCATION '" - + basePath - + "' AS SELECT * FROM VALUES (1, 2)"); + "CREATE TABLE `" + + tableName + + "` USING DELTA LOCATION '" + + basePath + + "' AS SELECT * FROM VALUES (1, 2)"); // Create Delta source SourceTable tableConfig = - SourceTable.builder() - .name(tableName) - .basePath(basePath.toString()) - .formatName(TableFormat.DELTA) - .build(); + SourceTable.builder() + .name(tableName) + .basePath(basePath.toString()) + .formatName(TableFormat.DELTA) + .build(); DeltaKernelConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); + conversionSourceProvider.getConversionSourceInstance(tableConfig); // Get 
current snapshot InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); -// snapshot.getPartitionedDataFiles().get(0) + // snapshot.getPartitionedDataFiles().get(0) // Validate table List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); validateTable( - snapshot.getTable(), - tableName, - TableFormat.DELTA, - InternalSchema.builder() - .name("struct") - .dataType(InternalType.RECORD) - .fields(fields) - .build(), - DataLayoutStrategy.FLAT, - "file://" + basePath, - snapshot.getTable().getLatestMetadataPath(), - Collections.emptyList()); + snapshot.getTable(), + tableName, + TableFormat.DELTA, + InternalSchema.builder() + .name("struct") + .dataType(InternalType.RECORD) + .fields(fields) + .build(), + DataLayoutStrategy.FLAT, + "file://" + basePath, + snapshot.getTable().getLatestMetadataPath(), + Collections.emptyList()); // Validate data files List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); -// validatePartitionDataFiles( -// PartitionFileGroup.builder() -// .files( -// Collections.singletonList( -// InternalDataFile.builder() -// .physicalPath("file:/fake/path") -// .fileFormat(FileFormat.APACHE_PARQUET) -// .partitionValues(Collections.emptyList()) -// .fileSizeBytes(716) -// .recordCount(1) -// .columnStats(columnStats) -// .build())) -// .partitionValues(Collections.emptyList()) -// .build(), -// snapshot.getPartitionedDataFiles().get(0)); -// System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); -// Configuration hadoopConf = new Configuration(); -// Engine myEngine = DefaultEngine.create(hadoopConf); -// Table myTable = Table.forPath(myEngine, basePath.toString()); -// Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); -// Scan myScan = mySnapshot.getScanBuilder().build(); -// -// -// // Common information about scanning for all data files to read. 
-// Row scanState = myScan.getScanState(myEngine); -// -// // Information about the list of scan files to read -// CloseableIterator fileIter = myScan.getScanFiles(myEngine); -// int readRecordCount = 0; -// try { -// StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(myEngine, scanState); -// while (fileIter.hasNext()) { -// FilteredColumnarBatch scanFilesBatch = fileIter.next(); -// try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { -// while (scanFileRows.hasNext()) { -// Row scanFileRow = scanFileRows.next(); -// FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); -// CloseableIterator physicalDataIter = -// myEngine -// .getParquetHandler() -// .readParquetFiles( -// singletonCloseableIterator(fileStatus), -// physicalReadSchema, -// Optional.empty()); -// try (CloseableIterator transformedData = -// Scan.transformPhysicalData(myEngine, scanState, scanFileRow, physicalDataIter)) { -// while (transformedData.hasNext()) { -// FilteredColumnarBatch logicalData = transformedData.next(); -// ColumnarBatch dataBatch = logicalData.getData(); -// -// // access the data for the column at ordinal 0 -// ColumnVector column0 = dataBatch.getColumnVector(0); -// ColumnVector column1 = dataBatch.getColumnVector(1); -//// -//// for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { -//// System.out.println(column0.getInt(rowIndex)); -//// } -// for (int rowIndex = 0; rowIndex < column1.getSize(); rowIndex++) { -// System.out.println(column1.getInt(rowIndex)); -// } -// } -// } -// } -// } -// } -// } catch (IOException e) { -// e.printStackTrace(); -// System.out.println("IOException occurred: " + e.getMessage()); -// } + validatePartitionDataFiles( + PartitionFileGroup.builder() + .files( + Collections.singletonList( + InternalDataFile.builder() + .physicalPath("file:/fake/path") + .fileFormat(FileFormat.APACHE_PARQUET) + .partitionValues(Collections.emptyList()) + .fileSizeBytes(716) + .recordCount(1) + .columnStats(columnStats) + .build())) + .partitionValues(Collections.emptyList()) + .build(), + snapshot.getPartitionedDataFiles().get(0)); + // System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); + // Configuration hadoopConf = new Configuration(); + // Engine myEngine = DefaultEngine.create(hadoopConf); + // Table myTable = Table.forPath(myEngine, basePath.toString()); + // Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); + // Scan myScan = mySnapshot.getScanBuilder().build(); + // + // + // // Common information about scanning for all data files to read. 
+ // Row scanState = myScan.getScanState(myEngine); + // + // // Information about the list of scan files to read + // CloseableIterator fileIter = myScan.getScanFiles(myEngine); + // int readRecordCount = 0; + // try { + // StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(myEngine, + // scanState); + // while (fileIter.hasNext()) { + // FilteredColumnarBatch scanFilesBatch = fileIter.next(); + // try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { + // while (scanFileRows.hasNext()) { + // Row scanFileRow = scanFileRows.next(); + // FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); + // CloseableIterator physicalDataIter = + // myEngine + // .getParquetHandler() + // .readParquetFiles( + // singletonCloseableIterator(fileStatus), + // physicalReadSchema, + // Optional.empty()); + // try (CloseableIterator transformedData = + // Scan.transformPhysicalData(myEngine, scanState, scanFileRow, + // physicalDataIter)) { + // while (transformedData.hasNext()) { + // FilteredColumnarBatch logicalData = transformedData.next(); + // ColumnarBatch dataBatch = logicalData.getData(); + // + // // access the data for the column at ordinal 0 + // ColumnVector column0 = dataBatch.getColumnVector(0); + // ColumnVector column1 = dataBatch.getColumnVector(1); + //// + //// for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { + //// System.out.println(column0.getInt(rowIndex)); + //// } + // for (int rowIndex = 0; rowIndex < column1.getSize(); rowIndex++) { + // System.out.println(column1.getInt(rowIndex)); + // } + // } + // } + // } + // } + // } + // } catch (IOException e) { + // e.printStackTrace(); + // System.out.println("IOException occurred: " + e.getMessage()); + // } + + } -} private void validatePartitionDataFiles( - PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) - throws URISyntaxException { + PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) + throws URISyntaxException { assertEquals( - expectedPartitionFiles.getPartitionValues(), actualPartitionFiles.getPartitionValues()); + expectedPartitionFiles.getPartitionValues(), actualPartitionFiles.getPartitionValues()); validateDataFiles(expectedPartitionFiles.getDataFiles(), actualPartitionFiles.getDataFiles()); } + private void validateDataFiles( - List expectedFiles, List actualFiles) - throws URISyntaxException { + List expectedFiles, List actualFiles) + throws URISyntaxException { Assertions.assertEquals(expectedFiles.size(), actualFiles.size()); for (int i = 0; i < expectedFiles.size(); i++) { InternalDataFile expected = expectedFiles.get(i); @@ -342,30 +332,30 @@ private void validateDataFiles( validatePropertiesDataFile(expected, actual); } } + private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual) - throws URISyntaxException { + throws URISyntaxException { Assertions.assertTrue( - Paths.get(new URI(actual.getPhysicalPath()).getPath()).isAbsolute(), - () -> "path == " + actual.getPhysicalPath() + " is not absolute"); + Paths.get(new URI(actual.getPhysicalPath()).getPath()).isAbsolute(), + () -> "path == " + actual.getPhysicalPath() + " is not absolute"); Assertions.assertEquals(expected.getFileFormat(), actual.getFileFormat()); Assertions.assertEquals(expected.getPartitionValues(), actual.getPartitionValues()); Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes()); System.out.println("Expected File Size: " + expected); 
System.out.println("Actual File Size: " + actual); -// Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); -// Instant now = Instant.now(); -// long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); -// long maxRange = now.toEpochMilli(); -// Assertions.assertTrue( -// actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, -// () -> -// "last modified == " -// + actual.getLastModified() -// + " is expected between " -// + minRange -// + " and " -// + maxRange); -// Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); + // Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); + Instant now = Instant.now(); + long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); + long maxRange = now.toEpochMilli(); + Assertions.assertTrue( + actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, + () -> + "last modified == " + + actual.getLastModified() + + " is expected between " + + minRange + + " and " + + maxRange); + Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); } - } From e9060910d9ca6bc6d8f865dc6383b4177b4eb391 Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sun, 20 Jul 2025 00:11:16 +0530 Subject: [PATCH 10/16] spotless fix 2 --- pom.xml | 2 +- .../delta/ITDeltaKernelConversionSource.java | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pom.xml b/pom.xml index db995a624..4c313f4c5 100644 --- a/pom.xml +++ b/pom.xml @@ -713,7 +713,7 @@ ${skipUTs} - true + false false 120 diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index 3ddb89762..ce4eb1185 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -240,21 +240,21 @@ void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); - validatePartitionDataFiles( - PartitionFileGroup.builder() - .files( - Collections.singletonList( - InternalDataFile.builder() - .physicalPath("file:/fake/path") - .fileFormat(FileFormat.APACHE_PARQUET) - .partitionValues(Collections.emptyList()) - .fileSizeBytes(716) - .recordCount(1) - .columnStats(columnStats) - .build())) - .partitionValues(Collections.emptyList()) - .build(), - snapshot.getPartitionedDataFiles().get(0)); +// validatePartitionDataFiles( +// PartitionFileGroup.builder() +// .files( +// Collections.singletonList( +// InternalDataFile.builder() +// .physicalPath("file:/fake/path") +// .fileFormat(FileFormat.APACHE_PARQUET) +// .partitionValues(Collections.emptyList()) +// .fileSizeBytes(716) +// .recordCount(1) +// .columnStats(columnStats) +// .build())) +// .partitionValues(Collections.emptyList()) +// .build(), +// snapshot.getPartitionedDataFiles().get(0)); // System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); // Configuration hadoopConf = new Configuration(); // Engine myEngine = DefaultEngine.create(hadoopConf); @@ -348,14 +348,14 @@ private void validatePropertiesDataFile(InternalDataFile expected, InternalDataF long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); long maxRange = now.toEpochMilli(); Assertions.assertTrue( - 
actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, - () -> - "last modified == " - + actual.getLastModified() - + " is expected between " - + minRange - + " and " - + maxRange); + actual.getLastModified() > minRange && actual.getLastModified() <= maxRange, + () -> + "last modified == " + + actual.getLastModified() + + " is expected between " + + minRange + + " and " + + maxRange); Assertions.assertEquals(expected.getColumnStats(), actual.getColumnStats()); } } From e00241c9bea30b72163e5b6cb0b47995e33a29df Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sun, 20 Jul 2025 00:21:23 +0530 Subject: [PATCH 11/16] spotless fix 2 --- .../delta/ITDeltaKernelConversionSource.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index ce4eb1185..102e98032 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -240,21 +240,21 @@ void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); -// validatePartitionDataFiles( -// PartitionFileGroup.builder() -// .files( -// Collections.singletonList( -// InternalDataFile.builder() -// .physicalPath("file:/fake/path") -// .fileFormat(FileFormat.APACHE_PARQUET) -// .partitionValues(Collections.emptyList()) -// .fileSizeBytes(716) -// .recordCount(1) -// .columnStats(columnStats) -// .build())) -// .partitionValues(Collections.emptyList()) -// .build(), -// snapshot.getPartitionedDataFiles().get(0)); + // validatePartitionDataFiles( + // PartitionFileGroup.builder() + // .files( + // Collections.singletonList( + // InternalDataFile.builder() + // .physicalPath("file:/fake/path") + // .fileFormat(FileFormat.APACHE_PARQUET) + // .partitionValues(Collections.emptyList()) + // .fileSizeBytes(716) + // .recordCount(1) + // .columnStats(columnStats) + // .build())) + // .partitionValues(Collections.emptyList()) + // .build(), + // snapshot.getPartitionedDataFiles().get(0)); // System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); // Configuration hadoopConf = new Configuration(); // Engine myEngine = DefaultEngine.create(hadoopConf); From 3fdfd315e73028ecc729714770fa1137db272ffc Mon Sep 17 00:00:00 2001 From: vaibhavk1992 Date: Sat, 26 Jul 2025 16:24:25 +0530 Subject: [PATCH 12/16] fixed partitioned test case --- .../delta/DeltaKernelActionsConverter.java | 17 +- .../delta/DeltaKernelDataFileExtractor.java | 24 +- .../delta/DeltaKernelSchemaExtractor.java | 8 +- .../delta/DeltaKernelStatsExtractor.java | 13 +- .../delta/DeltaKernelTableExtractor.java | 49 ++-- .../delta/ITDeltaKernelConversionSource.java | 213 ++++++++++-------- .../apache/xtable/testutil/ITTestUtils.java | 3 + .../test/resources/junit-platform.properties | 2 +- 8 files changed, 164 insertions(+), 165 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java index 7e87d2203..538fcf33c 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java +++ 
b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java @@ -23,7 +23,6 @@ import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Optional; import lombok.AccessLevel; import lombok.NoArgsConstructor; @@ -37,9 +36,8 @@ import io.delta.kernel.Table; import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; +import io.delta.kernel.internal.actions.AddFile; import io.delta.kernel.types.*; -import io.delta.kernel.utils.DataFileStatus; -import io.delta.kernel.utils.FileStatus; import org.apache.xtable.exception.NotSupportedException; import org.apache.xtable.model.schema.InternalField; @@ -58,7 +56,7 @@ public static DeltaKernelActionsConverter getInstance() { } public InternalDataFile convertAddActionToInternalDataFile( - FileStatus addFile, + AddFile addFile, Snapshot deltaSnapshot, FileFormat fileFormat, List partitionFields, @@ -67,16 +65,7 @@ public InternalDataFile convertAddActionToInternalDataFile( DeltaKernelPartitionExtractor partitionExtractor, DeltaKernelStatsExtractor fileStatsExtractor, Map partitionValues) { - DataFileStatus dataFileStatus = - new DataFileStatus( - addFile.getPath(), - addFile.getModificationTime(), - addFile.getSize(), - Optional.empty() // or Optional.empty() if not available - ); - System.out.println("dataFileStatus:" + dataFileStatus); - FileStats fileStats = fileStatsExtractor.getColumnStatsForFile(dataFileStatus, fields); - System.out.println("fileStats:" + fileStats); + FileStats fileStats = fileStatsExtractor.getColumnStatsForFile(addFile, fields); List columnStats = includeColumnStats ? fileStats.getColumnStats() : Collections.emptyList(); long recordCount = fileStats.getNumRecords(); diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java index ddb3b7782..4978d68e3 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java @@ -20,24 +20,25 @@ // import scala.collection.Map; import java.util.*; +import java.util.List; import java.util.stream.Collectors; import lombok.Builder; import org.apache.hadoop.conf.Configuration; -import io.delta.kernel.Scan; import io.delta.kernel.Snapshot; import io.delta.kernel.data.FilteredColumnarBatch; import io.delta.kernel.data.Row; import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.ScanImpl; import io.delta.kernel.internal.SnapshotImpl; +import io.delta.kernel.internal.actions.AddFile; import io.delta.kernel.types.StructField; import io.delta.kernel.types.StructType; import io.delta.kernel.utils.CloseableIterator; -import io.delta.kernel.utils.FileStatus; import org.apache.xtable.model.schema.InternalField; import org.apache.xtable.model.schema.InternalPartitionField; @@ -101,8 +102,15 @@ private DeltaDataFileIterator( Configuration hadoopConf = new Configuration(); Engine engine = DefaultEngine.create(hadoopConf); - Scan myScan = snapshot.getScanBuilder().build(); - CloseableIterator scanFiles = myScan.getScanFiles(engine); + // Scan myScan = snapshot.getScanBuilder().build(); + // CloseableIterator scanFiles = myScan.getScanFiles(engine); + + ScanImpl myScan = (ScanImpl) snapshot.getScanBuilder().build(); + CloseableIterator scanFiles = + 
myScan.getScanFiles(engine, includeColumnStats); + // String statsJson = extractStatsJson(scanFiles,fullSchema); + // System.out.println("StatsJson: " + statsJson); + this.dataFilesIterator = Collections .emptyIterator(); // Initialize the dataFilesIterator by iterating over the scan files @@ -111,10 +119,12 @@ private DeltaDataFileIterator( CloseableIterator scanFileRows = scanFileColumnarBatch.getRows(); while (scanFileRows.hasNext()) { Row scanFileRow = scanFileRows.next(); - // From the scan file row, extract the file path, size and modification time metadata // needed to read the file. - FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); + AddFile addFile = + new AddFile(scanFileRow.getStruct(scanFileRow.getSchema().indexOf("add"))); + + // FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); Map partitionValues = InternalScanFileUtils.getPartitionValues(scanFileRow); // Convert the FileStatus to InternalDataFile using the actionsConverter @@ -122,7 +132,7 @@ private DeltaDataFileIterator( this.dataFilesIterator = Collections.singletonList( actionsConverter.convertAddActionToInternalDataFile( - fileStatus, + addFile, snapshot, fileFormat, partitionFields, diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java index f0fc18736..6353adf8d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java @@ -44,13 +44,13 @@ public static DeltaKernelSchemaExtractor getInstance() { return INSTANCE; } - public InternalSchema toInternalSchema_v2(StructType structType) { - return toInternalSchema_v2(structType, null, false, null); + public InternalSchema toInternalSchema(StructType structType) { + return toInternalSchema(structType, null, false, null); } String trimmedTypeName = ""; - private InternalSchema toInternalSchema_v2( + private InternalSchema toInternalSchema( DataType dataType, String parentPath, boolean nullable, String comment) { Map metadata = null; @@ -88,7 +88,7 @@ private InternalSchema toInternalSchema_v2( ? 
field.getMetadata().getString("comment")
             : null;
     InternalSchema schema =
-        toInternalSchema_v2(
+        toInternalSchema(
             field.getDataType(),
             SchemaUtils.getFullyQualifiedPath(parentPath, field.getName()),
             field.isNullable(),
diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java
index bedb67ad1..3839b7fb8 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java
@@ -39,8 +39,7 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.annotations.VisibleForTesting;
 
-import io.delta.kernel.statistics.DataFileStatistics;
-import io.delta.kernel.utils.DataFileStatus;
+import io.delta.kernel.internal.actions.AddFile;
 
 import org.apache.xtable.collectors.CustomCollectors;
 import org.apache.xtable.model.exception.ParseException;
@@ -179,20 +178,16 @@ private void insertValueAtPath(Map<String, Object> jsonObject, String[] pathPart
     }
   }
 
-  public FileStats getColumnStatsForFile(DataFileStatus addFile, List<InternalField> fields) {
+  public FileStats getColumnStatsForFile(AddFile addFile, List<InternalField> fields) {
 
-    Optional<String> statsOpt = addFile.getStatistics().map(DataFileStatistics::toString);
-    System.out.println("statsOpt:" + statsOpt);
+    Optional<String> statsOpt = addFile.getStatsJson();
     if (!statsOpt.isPresent() || StringUtils.isEmpty(statsOpt.get())) {
-      System.out.println("No statistics available1");
       // No statistics available
       return FileStats.builder().columnStats(Collections.emptyList()).numRecords(0).build();
     }
 
     // TODO: Additional work needed to track maps & arrays.
     try {
-      DeltaStats deltaStats =
-          MAPPER.readValue(addFile.getStatistics().get().toString(), DeltaStats.class);
-      System.out.println("deltaStats:" + deltaStats);
+      DeltaStats deltaStats = MAPPER.readValue(statsOpt.get(), DeltaStats.class);
       collectUnsupportedStats(deltaStats.getAdditionalStats());
 
       Map<String, Object> fieldPathToMaxValue = flattenStatMap(deltaStats.getMaxValues());
diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java
index f99d31c32..f1e4ed780 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelTableExtractor.java
@@ -19,19 +19,19 @@
 package org.apache.xtable.delta;
 
 import java.time.Instant;
-import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import lombok.Builder;
 
 import io.delta.kernel.*;
 import io.delta.kernel.engine.Engine;
+import io.delta.kernel.types.StructField;
+import io.delta.kernel.types.StructType;
 
 import org.apache.xtable.model.InternalTable;
-import org.apache.xtable.model.schema.InternalField;
 import org.apache.xtable.model.schema.InternalPartitionField;
 import org.apache.xtable.model.schema.InternalSchema;
-import org.apache.xtable.model.schema.InternalType;
 import org.apache.xtable.model.storage.DataLayoutStrategy;
 import org.apache.xtable.model.storage.TableFormat;
@@ -51,42 +51,29 @@ public InternalTable table(
     try {
       // Get schema from Delta Kernel's snapshot
       io.delta.kernel.types.StructType schema = snapshot.getSchema();
+      InternalSchema internalSchema = schemaExtractor.toInternalSchema(schema);
+      // Get partition columns
+      StructType fullSchema = snapshot.getSchema(); // The full table schema
+      List<String> partitionColumns =
snapshot.getPartitionColumnNames(); // List - System.out.println("Kernelschema: " + schema); + List partitionFields_strfld = + fullSchema.fields().stream() + .filter(field -> partitionColumns.contains(field.getName())) + .collect(Collectors.toList()); - InternalSchema internalSchema = schemaExtractor.toInternalSchema_v2(schema); - // io.delta.kernel.types.StructType schema = snapshot.getSchema(); - //// InternalSchema internalSchema = schemaExtractor.toInternalSchema_v2(schema); - // InternalSchema internalSchema = - // schemaExtractor.toInternalSchema(snapshot.getSchema()); + StructType partitionSchema = new StructType(partitionFields_strfld); - // Get partition columns - System.out.println("Partition columns: " + internalSchema); - List partitionColumnNames = snapshot.getPartitionColumnNames(); - List partitionFields = new ArrayList<>(); - for (String columnName : partitionColumnNames) { - InternalField sourceField = - InternalField.builder() - .name(columnName) - .schema( - InternalSchema.builder() - .name(columnName) - .dataType(InternalType.STRING) // Assuming string type for partition columns - .build()) - .build(); - - // Create the partition field with the source field - partitionFields.add(InternalPartitionField.builder().sourceField(sourceField).build()); - } + List partitionFields = + DeltaKernelPartitionExtractor.getInstance() + .convertFromDeltaPartitionFormat(internalSchema, partitionSchema); DataLayoutStrategy dataLayoutStrategy = - partitionFields.isEmpty() - ? DataLayoutStrategy.FLAT - : DataLayoutStrategy.HIVE_STYLE_PARTITION; + !partitionFields.isEmpty() + ? DataLayoutStrategy.HIVE_STYLE_PARTITION + : DataLayoutStrategy.FLAT; // Get the timestamp long timestamp = snapshot.getTimestamp(engine) * 1000; // Convert to milliseconds - System.out.println("InternalTable basepath" + basePath); return InternalTable.builder() .tableFormat(TableFormat.DELTA) .basePath(basePath) diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index 102e98032..8823622a8 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -44,10 +44,9 @@ import org.apache.xtable.kernel.DeltaKernelConversionSource; import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; -import org.apache.xtable.model.schema.InternalField; -import org.apache.xtable.model.schema.InternalSchema; -import org.apache.xtable.model.schema.InternalType; +import org.apache.xtable.model.schema.*; import org.apache.xtable.model.stat.ColumnStat; +import org.apache.xtable.model.stat.PartitionValue; import org.apache.xtable.model.stat.Range; import org.apache.xtable.model.storage.*; import org.apache.xtable.model.storage.DataLayoutStrategy; @@ -130,9 +129,10 @@ public static void setupOnce() { @TempDir private static Path tempDir; @AfterAll - public static void teardown() { + public static void tearDownSparkSession() { if (sparkSession != null) { - sparkSession.close(); + sparkSession.catalog().clearCache(); + sparkSession.stop(); } } @@ -145,11 +145,72 @@ void setUp() { conversionSourceProvider.init(hadoopConf); } + @Test + void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { + // Table name + final String tableName = GenericTable.getTableName(); + final Path basePath = tempDir.resolve(tableName); + 
System.out.println("Table Name Non partitioned : " + basePath); + // Create table with a single row using Spark + sparkSession.sql( + "CREATE TABLE `" + + tableName + + "` USING DELTA LOCATION '" + + basePath + + "' AS SELECT * FROM VALUES (1, 2)"); + // Create Delta source + SourceTable tableConfig = + SourceTable.builder() + .name(tableName) + .basePath(basePath.toString()) + .formatName(TableFormat.DELTA) + .build(); + DeltaKernelConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + // Get current snapshot + InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // Validate table + List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); + validateTable( + snapshot.getTable(), + tableName, + TableFormat.DELTA, + InternalSchema.builder() + .name("struct") + .dataType(InternalType.RECORD) + .fields(fields) + .build(), + DataLayoutStrategy.FLAT, + "file://" + basePath, + snapshot.getTable().getLatestMetadataPath(), + Collections.emptyList()); + // Validate data files + List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); + Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); + + validatePartitionDataFiles( + PartitionFileGroup.builder() + .files( + Collections.singletonList( + InternalDataFile.builder() + .physicalPath("file:/fake/path") + .fileFormat(FileFormat.APACHE_PARQUET) + .partitionValues(Collections.emptyList()) + .fileSizeBytes(716) + .recordCount(1) + .columnStats(columnStats) + .build())) + .partitionValues(Collections.emptyList()) + .build(), + snapshot.getPartitionedDataFiles().get(0)); + } + @Test void getCurrentTableTest() { // Table name final String tableName = GenericTable.getTableName(); final Path basePath = tempDir.resolve(tableName); + ; // Create table with a single row using Spark sparkSession.sql( "CREATE TABLE `" @@ -164,20 +225,11 @@ void getCurrentTableTest() { .basePath(basePath.toString()) .formatName(TableFormat.DELTA) .build(); - // System.out.println( - // "Table Config: " + tableConfig.getBasePath() + ", " + tableConfig.getDataPath()); DeltaKernelConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); // Get current table InternalTable internalTable = conversionSource.getCurrentTable(); List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD, COL3_STR_FIELD); - // System.out.println("Internal Table: " + internalTable); - // System.out.println("Fields: " + fields); - // System.out.println("Table Format: " + TableFormat.DELTA); - // System.out.println("Data Layout Strategy: " + DataLayoutStrategy.FLAT); - // System.out.println("Base Path: " + basePath); - // System.out.println("Latest getReadSchema : " + internalTable.getReadSchema()); - // System.out.println("Latest getLatestMetadataPath : " + InternalSchema); validateTable( internalTable, tableName, @@ -194,20 +246,18 @@ void getCurrentTableTest() { } @Test - void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { + void getCurrentSnapshotPartitionedTest() throws URISyntaxException { // Table name final String tableName = GenericTable.getTableName(); final Path basePath = tempDir.resolve(tableName); - - System.out.println("Table Name: " + tableName); - System.out.println("Base Path: " + basePath); // Create table with a single row using Spark sparkSession.sql( "CREATE TABLE `" + tableName - + "` USING DELTA LOCATION '" + + "` USING DELTA PARTITIONED BY (part_col)\n" + + "LOCATION '" + basePath - + "' AS SELECT * FROM VALUES (1, 2)"); + 
+ "' AS SELECT 'SingleValue' AS part_col, 1 AS col1, 2 AS col2"); // Create Delta source SourceTable tableConfig = SourceTable.builder() @@ -219,10 +269,19 @@ void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { conversionSourceProvider.getConversionSourceInstance(tableConfig); // Get current snapshot InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - - // snapshot.getPartitionedDataFiles().get(0) // Validate table - List fields = Arrays.asList(COL1_INT_FIELD, COL2_INT_FIELD); + InternalField partCol = + InternalField.builder() + .name("part_col") + .schema( + InternalSchema.builder() + .name("string") + .dataType(InternalType.STRING) + .isNullable(true) + .build()) + .defaultValue(InternalField.Constants.NULL_DEFAULT_VALUE) + .build(); + List fields = Arrays.asList(partCol, COL1_INT_FIELD, COL2_INT_FIELD); validateTable( snapshot.getTable(), tableName, @@ -232,86 +291,42 @@ void getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { .dataType(InternalType.RECORD) .fields(fields) .build(), - DataLayoutStrategy.FLAT, + DataLayoutStrategy.HIVE_STYLE_PARTITION, "file://" + basePath, snapshot.getTable().getLatestMetadataPath(), - Collections.emptyList()); + Collections.singletonList( + InternalPartitionField.builder() + .sourceField(partCol) + .transformType(PartitionTransformType.VALUE) + .build())); // Validate data files List columnStats = Arrays.asList(COL1_COLUMN_STAT, COL2_COLUMN_STAT); Assertions.assertEquals(1, snapshot.getPartitionedDataFiles().size()); - - // validatePartitionDataFiles( - // PartitionFileGroup.builder() - // .files( - // Collections.singletonList( - // InternalDataFile.builder() - // .physicalPath("file:/fake/path") - // .fileFormat(FileFormat.APACHE_PARQUET) - // .partitionValues(Collections.emptyList()) - // .fileSizeBytes(716) - // .recordCount(1) - // .columnStats(columnStats) - // .build())) - // .partitionValues(Collections.emptyList()) - // .build(), - // snapshot.getPartitionedDataFiles().get(0)); - // System.out.println(snapshot.getPartitionedDataFiles().get(0).getDataFiles()); - // Configuration hadoopConf = new Configuration(); - // Engine myEngine = DefaultEngine.create(hadoopConf); - // Table myTable = Table.forPath(myEngine, basePath.toString()); - // Snapshot mySnapshot = myTable.getLatestSnapshot(myEngine); - // Scan myScan = mySnapshot.getScanBuilder().build(); - // - // - // // Common information about scanning for all data files to read. 
- // Row scanState = myScan.getScanState(myEngine); - // - // // Information about the list of scan files to read - // CloseableIterator fileIter = myScan.getScanFiles(myEngine); - // int readRecordCount = 0; - // try { - // StructType physicalReadSchema = ScanStateRow.getPhysicalDataReadSchema(myEngine, - // scanState); - // while (fileIter.hasNext()) { - // FilteredColumnarBatch scanFilesBatch = fileIter.next(); - // try (CloseableIterator scanFileRows = scanFilesBatch.getRows()) { - // while (scanFileRows.hasNext()) { - // Row scanFileRow = scanFileRows.next(); - // FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); - // CloseableIterator physicalDataIter = - // myEngine - // .getParquetHandler() - // .readParquetFiles( - // singletonCloseableIterator(fileStatus), - // physicalReadSchema, - // Optional.empty()); - // try (CloseableIterator transformedData = - // Scan.transformPhysicalData(myEngine, scanState, scanFileRow, - // physicalDataIter)) { - // while (transformedData.hasNext()) { - // FilteredColumnarBatch logicalData = transformedData.next(); - // ColumnarBatch dataBatch = logicalData.getData(); - // - // // access the data for the column at ordinal 0 - // ColumnVector column0 = dataBatch.getColumnVector(0); - // ColumnVector column1 = dataBatch.getColumnVector(1); - //// - //// for (int rowIndex = 0; rowIndex < column0.getSize(); rowIndex++) { - //// System.out.println(column0.getInt(rowIndex)); - //// } - // for (int rowIndex = 0; rowIndex < column1.getSize(); rowIndex++) { - // System.out.println(column1.getInt(rowIndex)); - // } - // } - // } - // } - // } - // } - // } catch (IOException e) { - // e.printStackTrace(); - // System.out.println("IOException occurred: " + e.getMessage()); - // } - + List partitionValue = + Collections.singletonList( + PartitionValue.builder() + .partitionField( + InternalPartitionField.builder() + .sourceField(partCol) + .transformType(PartitionTransformType.VALUE) + .build()) + .range(Range.scalar("SingleValue")) + .build()); + validatePartitionDataFiles( + PartitionFileGroup.builder() + .partitionValues(partitionValue) + .files( + Collections.singletonList( + InternalDataFile.builder() + .physicalPath("file:/fake/path") + .fileFormat(FileFormat.APACHE_PARQUET) + .partitionValues(partitionValue) + .fileSizeBytes(716) + .recordCount(1) + .columnStats(columnStats) + .build())) + .build(), + snapshot.getPartitionedDataFiles().get(0)); } private void validatePartitionDataFiles( @@ -343,7 +358,7 @@ private void validatePropertiesDataFile(InternalDataFile expected, InternalDataF Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes()); System.out.println("Expected File Size: " + expected); System.out.println("Actual File Size: " + actual); - // Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); + Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount()); Instant now = Instant.now(); long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli(); long maxRange = now.toEpochMilli(); diff --git a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java index ca1b32ca5..21230749d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java +++ b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java @@ -49,6 +49,9 @@ public static void validateTable( String latestMetadataPath, List partitioningFields) { + System.out.println("readSchema: " + 
readSchema);
+    System.out.println("internalTable readSchema: " + internalTable.getReadSchema());
+
     Assertions.assertEquals(tableName, internalTable.getName());
     Assertions.assertEquals(tableFormat, internalTable.getTableFormat());
     Assertions.assertEquals(readSchema, internalTable.getReadSchema());
diff --git a/xtable-core/src/test/resources/junit-platform.properties b/xtable-core/src/test/resources/junit-platform.properties
index 57f568b3a..b1a97a2f2 100644
--- a/xtable-core/src/test/resources/junit-platform.properties
+++ b/xtable-core/src/test/resources/junit-platform.properties
@@ -14,6 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-junit.jupiter.execution.parallel.enabled=true
+junit.jupiter.execution.parallel.enabled=false
 junit.jupiter.execution.parallel.mode.default = concurrent
 junit.jupiter.execution.parallel.mode.classes.default = concurrent
\ No newline at end of file

From e0102e3d941776d42146d5570a7a09eba37c741a Mon Sep 17 00:00:00 2001
From: vaibhavk1992
Date: Mon, 28 Jul 2025 20:29:49 +0530
Subject: [PATCH 13/16] setting junit parallel execution to true

---
 xtable-core/src/test/resources/junit-platform.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xtable-core/src/test/resources/junit-platform.properties b/xtable-core/src/test/resources/junit-platform.properties
index b1a97a2f2..57f568b3a 100644
--- a/xtable-core/src/test/resources/junit-platform.properties
+++ b/xtable-core/src/test/resources/junit-platform.properties
@@ -14,6 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-junit.jupiter.execution.parallel.enabled=false
+junit.jupiter.execution.parallel.enabled=true
 junit.jupiter.execution.parallel.mode.default = concurrent
 junit.jupiter.execution.parallel.mode.classes.default = concurrent
\ No newline at end of file

From 381722a239a6377dedbbefcbdc99eacfa444275c Mon Sep 17 00:00:00 2001
From: vaibhavk1992
Date: Tue, 5 Aug 2025 10:08:43 +0530
Subject: [PATCH 14/16] testInsertsUpsertsAndDeletes test case addition,
 internal datatype additions, bug fixes

---
 .../delta/DeltaKernelActionsConverter.java    |  50 ++-----
 .../delta/DeltaKernelDataFileExtractor.java   |  14 +-
 .../delta/DeltaKernelSchemaExtractor.java     | 122 +++++++++++++++--
 .../delta/DeltaKernelStatsExtractor.java      |  20 +--
 .../kernel/DeltaKernelConversionSource.java   | 125 +++++++++++++-----
 .../delta/ITDeltaKernelConversionSource.java  |  83 +++++++++++-
 .../apache/xtable/testutil/ITTestUtils.java   |   4 -
 7 files changed, 313 insertions(+), 105 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
index 538fcf33c..3a6c47089 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
@@ -21,9 +21,12 @@
 import static org.apache.xtable.delta.DeltaActionsConverter.getFullPathToFile;
 
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import io.delta.kernel.data.MapValue;
+import io.delta.kernel.internal.InternalScanFileUtils;
 import lombok.AccessLevel;
 import lombok.NoArgsConstructor;
@@ -64,7 +67,8 @@ public InternalDataFile convertAddActionToInternalDataFile(
       boolean includeColumnStats,
       DeltaKernelPartitionExtractor partitionExtractor,
       DeltaKernelStatsExtractor
fileStatsExtractor, - Map partitionValues) { + Map partitionValues) + { FileStats fileStats = fileStatsExtractor.getColumnStatsForFile(addFile, fields); List columnStats = includeColumnStats ? fileStats.getColumnStats() : Collections.emptyList(); @@ -73,8 +77,9 @@ public InternalDataFile convertAddActionToInternalDataFile( Engine myEngine = DefaultEngine.create(hadoopConf); Table myTable = Table.forPath(myEngine, addFile.getPath()); // The immutable map from Java to Scala is not working, need to + scala.collection.mutable.Map scalaMap = - JavaConverters.mapAsScalaMap(partitionValues); + JavaConverters.mapAsScalaMap(partitionValues); return InternalDataFile.builder() .physicalPath(getFullPathToFile(deltaSnapshot, addFile.getPath(), myTable)) @@ -87,22 +92,6 @@ public InternalDataFile convertAddActionToInternalDataFile( .build(); } - // - // public InternalDataFile convertRemoveActionToInternalDataFile( - // RemoveFile removeFile, - // Snapshot deltaSnapshot, - // FileFormat fileFormat, - // List partitionFields, - // DeltaPartitionExtractor partitionExtractor) { - // return InternalDataFile.builder() - // .physicalPath(getFullPathToFile(deltaSnapshot, removeFile.path())) - // .fileFormat(fileFormat) - // .partitionValues( - // partitionExtractor.partitionValueExtraction( - // removeFile.partitionValues(), partitionFields)) - // .build(); - // } - public FileFormat convertToFileFormat(String provider) { if (provider.equals("parquet")) { return FileFormat.APACHE_PARQUET; @@ -116,32 +105,13 @@ public FileFormat convertToFileFormat(String provider) { static String getFullPathToFile(Snapshot snapshot, String dataFilePath, Table myTable) { Configuration hadoopConf = new Configuration(); Engine myEngine = DefaultEngine.create(hadoopConf); - +// Table myTable = Table.forPath(myEngine, basePath.toString()); String tableBasePath = myTable.getPath(myEngine); - // String tableBasePath = snapshot.dataPath().toUri().toString(); +// String tableBasePath = snapshot.dataPath().toUri().toString(); if (dataFilePath.startsWith(tableBasePath)) { return dataFilePath; } - return tableBasePath + Path.SEPARATOR + dataFilePath; + return tableBasePath ; } - /** - * Extracts the representation of the deletion vector information corresponding to an AddFile - * action. Currently, this method extracts and returns the path to the data file for which a - * deletion vector data is present. 
- * - * @param snapshot the commit snapshot - * @param addFile the add file action - * @return the deletion vector representation (path of data file), or null if no deletion vector - * is present - */ - // public String extractDeletionVectorFile(Snapshot snapshot, AddFile addFile) { - // DeletionVectorDescriptor deletionVector = addFile.deletionVector(); - // if (deletionVector == null) { - // return null; - // } - // - // String dataFilePath = addFile.path(); - // return getFullPathToFile(snapshot, dataFilePath); - // } } diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java index 4978d68e3..bc776b071 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java @@ -108,9 +108,10 @@ private DeltaDataFileIterator( ScanImpl myScan = (ScanImpl) snapshot.getScanBuilder().build(); CloseableIterator scanFiles = myScan.getScanFiles(engine, includeColumnStats); + // String statsJson = extractStatsJson(scanFiles,fullSchema); // System.out.println("StatsJson: " + statsJson); - + List dataFiles = new ArrayList<>(); this.dataFilesIterator = Collections .emptyIterator(); // Initialize the dataFilesIterator by iterating over the scan files @@ -123,14 +124,10 @@ private DeltaDataFileIterator( // needed to read the file. AddFile addFile = new AddFile(scanFileRow.getStruct(scanFileRow.getSchema().indexOf("add"))); - - // FileStatus fileStatus = InternalScanFileUtils.getAddFileStatus(scanFileRow); Map partitionValues = InternalScanFileUtils.getPartitionValues(scanFileRow); // Convert the FileStatus to InternalDataFile using the actionsConverter - System.out.println("Calling the ActionToInternalDataFile"); - this.dataFilesIterator = - Collections.singletonList( + dataFiles.add( actionsConverter.convertAddActionToInternalDataFile( addFile, snapshot, @@ -140,10 +137,11 @@ private DeltaDataFileIterator( includeColumnStats, partitionExtractor, fileStatsExtractor, - partitionValues)) - .iterator(); + partitionValues)); + } } + this.dataFilesIterator = dataFiles.iterator(); } @Override diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java index 6353adf8d..a92fce7f3 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java @@ -20,12 +20,10 @@ import java.util.*; -import io.delta.kernel.types.DataType; -import io.delta.kernel.types.IntegerType; -import io.delta.kernel.types.StringType; -import io.delta.kernel.types.StructType; +import io.delta.kernel.types.*; import org.apache.xtable.collectors.CustomCollectors; +import org.apache.xtable.exception.NotSupportedException; import org.apache.xtable.model.schema.InternalField; import org.apache.xtable.model.schema.InternalSchema; import org.apache.xtable.model.schema.InternalType; @@ -45,26 +43,67 @@ public static DeltaKernelSchemaExtractor getInstance() { } public InternalSchema toInternalSchema(StructType structType) { - return toInternalSchema(structType, null, false, null); + return toInternalSchema(structType, null, false, null,null); } String trimmedTypeName = ""; + InternalType type = null; private InternalSchema toInternalSchema( - DataType dataType, String 
parentPath, boolean nullable, String comment) { + DataType dataType, String parentPath, boolean nullable, String comment, FieldMetadata originalMetadata) { Map metadata = null; List fields = null; - InternalType type = null; + if (dataType instanceof IntegerType) { type = InternalType.INT; trimmedTypeName = "integer"; } - if (dataType instanceof StringType) { + else if(dataType instanceof StringType) { type = InternalType.STRING; trimmedTypeName = "string"; } - if (dataType instanceof StructType) { + else if (dataType instanceof BooleanType) { + type = InternalType.BOOLEAN; + trimmedTypeName = "boolean"; + } + else if (dataType instanceof FloatType) { + type = InternalType.FLOAT; + trimmedTypeName = "float"; + } + else if (dataType instanceof DoubleType) { + type = InternalType.DOUBLE; + trimmedTypeName = "double"; + } + else if (dataType instanceof BinaryType) { + if (originalMetadata.contains(InternalSchema.XTABLE_LOGICAL_TYPE) + && "uuid".equals(originalMetadata.getString(InternalSchema.XTABLE_LOGICAL_TYPE))) { + type = InternalType.UUID; + trimmedTypeName = "binary"; + } else { + type = InternalType.BYTES; + trimmedTypeName = "binary"; + } + } + else if (dataType instanceof LongType) { + type = InternalType.LONG; + trimmedTypeName = "long"; + } + else if (dataType instanceof DateType) { + type = InternalType.DATE; + trimmedTypeName = "date"; + } + else if (dataType instanceof TimestampType) { + type = InternalType.TIMESTAMP; + metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; + trimmedTypeName = "timestamp"; + } + else if (dataType instanceof TimestampNTZType) { + type = InternalType.TIMESTAMP_NTZ; + metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; + trimmedTypeName = "timestamp_ntz"; + } + else if (dataType instanceof StructType) { // Handle StructType StructType structType = (StructType) dataType; // your logic here @@ -92,7 +131,8 @@ private InternalSchema toInternalSchema( field.getDataType(), SchemaUtils.getFullyQualifiedPath(parentPath, field.getName()), field.isNullable(), - fieldComment); + fieldComment, + field.getMetadata()); return InternalField.builder() .name(field.getName()) .fieldId(fieldId) @@ -106,7 +146,69 @@ private InternalSchema toInternalSchema( type = InternalType.RECORD; trimmedTypeName = "struct"; } + else if (dataType instanceof DecimalType) { + DecimalType decimalType = (DecimalType) dataType; + metadata = new HashMap<>(2, 1.0f); + metadata.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, decimalType.getPrecision()); + metadata.put(InternalSchema.MetadataKey.DECIMAL_SCALE, decimalType.getScale()); + type = InternalType.DECIMAL; + trimmedTypeName = "decimal"; + } + else if (dataType instanceof ArrayType) { + ArrayType arrayType = (ArrayType) dataType; + InternalSchema elementSchema = + toInternalSchema( + arrayType.getElementType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME), + arrayType.containsNull(), + null, + null); + InternalField elementField = + InternalField.builder() + .name(InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME) + .parentPath(parentPath) + .schema(elementSchema) + .build(); + type = InternalType.LIST; + fields = Collections.singletonList(elementField); + trimmedTypeName = "array"; + } + else if (dataType instanceof MapType) { + MapType mapType = (MapType) dataType; + InternalSchema keySchema = + toInternalSchema( + mapType.getKeyType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), + false, + null, + null); + InternalField keyField 
= + InternalField.builder() + .name(InternalField.Constants.MAP_KEY_FIELD_NAME) + .parentPath(parentPath) + .schema(keySchema) + .build(); + InternalSchema valueSchema = + toInternalSchema( + mapType.getValueType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), + mapType.isValueContainsNull(), + null, + null); + InternalField valueField = + InternalField.builder() + .name(InternalField.Constants.MAP_VALUE_FIELD_NAME) + .parentPath(parentPath) + .schema(valueSchema) + .build(); + type = InternalType.MAP; + fields = Arrays.asList(keyField, valueField); + trimmedTypeName = "map"; + } return InternalSchema.builder() .name(trimmedTypeName) .dataType(type) diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java index 3839b7fb8..1793efa39 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java @@ -188,6 +188,7 @@ public FileStats getColumnStatsForFile(AddFile addFile, List fiel // TODO: Additional work needed to track maps & arrays. try { DeltaStats deltaStats = MAPPER.readValue(statsOpt.get(), DeltaStats.class); + collectUnsupportedStats(deltaStats.getAdditionalStats()); Map fieldPathToMaxValue = flattenStatMap(deltaStats.getMaxValues()); @@ -199,18 +200,21 @@ public FileStats getColumnStatsForFile(AddFile addFile, List fiel .map( field -> { String fieldPath = field.getPath(); - Object minValue = - DeltaValueConverter.convertFromDeltaColumnStatValue( - fieldPathToMinValue.get(fieldPath), field.getSchema()); - Object maxValue = - DeltaValueConverter.convertFromDeltaColumnStatValue( - fieldPathToMaxValue.get(fieldPath), field.getSchema()); - Number nullCount = (Number) fieldPathToNullCount.get(fieldPath); + Object minRaw = fieldPathToMinValue.get(fieldPath); + Object maxRaw = fieldPathToMaxValue.get(fieldPath); + Object nullCountRaw = fieldPathToNullCount.get(fieldPath); + Object minValue = minRaw != null + ? DeltaValueConverter.convertFromDeltaColumnStatValue(minRaw, field.getSchema()) + : null; + Object maxValue = maxRaw != null + ? DeltaValueConverter.convertFromDeltaColumnStatValue(maxRaw, field.getSchema()) + : null; + long nullCount = nullCountRaw instanceof Number ? 
((Number) nullCountRaw).longValue() : 0; Range range = Range.vector(minValue, maxValue); return ColumnStat.builder() .field(field) .numValues(deltaStats.getNumRecords()) - .numNulls(nullCount.longValue()) + .numNulls(nullCount) .range(range) .build(); }) diff --git a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java index 958683045..e056882f8 100644 --- a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java @@ -19,27 +19,44 @@ package org.apache.xtable.kernel; import java.io.IOException; +import java.sql.Timestamp; import java.time.Instant; -import java.util.ArrayList; -import java.util.List; +import java.util.*; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.SnapshotImpl; +import io.delta.kernel.internal.util.FileNames; import lombok.Builder; import org.apache.hadoop.conf.Configuration; - import io.delta.kernel.Snapshot; import io.delta.kernel.Table; import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; - +import io.delta.kernel.internal.actions.*; +import io.delta.kernel.internal.DeltaLogActionUtils; +import io.delta.kernel.internal.replay.ActionsIterator; +import io.delta.kernel.internal.actions.SingleAction; +import io.delta.kernel.internal.util.FileNames.DeltaLogFileType; +import io.delta.kernel.types.StructType; +import io.delta.kernel.data.Row; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; +import io.delta.kernel.internal.fs.Path; + + +import org.apache.spark.sql.delta.DeltaHistoryManager; import org.apache.xtable.delta.*; import org.apache.xtable.exception.ReadException; import org.apache.xtable.model.*; import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.storage.FileFormat; import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.xtable.model.storage.InternalFilesDiff; import org.apache.xtable.model.storage.PartitionFileGroup; import org.apache.xtable.spi.extractor.ConversionSource; import org.apache.xtable.spi.extractor.DataFileIterator; +import scala.Option; @Builder public class DeltaKernelConversionSource implements ConversionSource { @@ -47,16 +64,20 @@ public class DeltaKernelConversionSource implements ConversionSource { @Builder.Default private final DeltaKernelDataFileExtractor dataFileExtractor = DeltaKernelDataFileExtractor.builder().build(); + @Builder.Default + private final DeltaKernelActionsConverter actionsConverter = DeltaKernelActionsConverter.getInstance(); private final String basePath; private final String tableName; private final Engine engine; + private final StructType actionSchema = SingleAction.FULL_SCHEMA; // private final DeltaKernelTableExtractor tableExtractor; @Builder.Default private final DeltaKernelTableExtractor tableExtractor = DeltaKernelTableExtractor.builder().build(); + private Optional deltaIncrementalChangesState = Optional.empty(); @Override public InternalTable getTable(Long version) { @@ -65,7 +86,6 @@ public InternalTable getTable(Long version) { Engine engine = DefaultEngine.create(hadoopConf); Table table = Table.forPath(engine, basePath); Snapshot snapshot = table.getSnapshotAsOfVersion(engine, version); - System.out.println("getTable: " + basePath); return tableExtractor.table(table, snapshot, engine, tableName, 
basePath);
     } catch (Exception e) {
       throw new ReadException("Failed to get table at version " + version, e);
@@ -77,7 +97,6 @@ public InternalTable getCurrentTable() {
     Configuration hadoopConf = new Configuration();
     Engine engine = DefaultEngine.create(hadoopConf);
     Table table = Table.forPath(engine, basePath);
-    System.out.println("getCurrentTable: " + basePath);
     Snapshot snapshot = table.getLatestSnapshot(engine);
     return getTable(snapshot.getVersion());
   }
@@ -86,7 +105,6 @@ public InternalTable getCurrentTable() {
   public InternalSnapshot getCurrentSnapshot() {
     Configuration hadoopConf = new Configuration();
     Engine engine = DefaultEngine.create(hadoopConf);
-    System.out.println("getCurrentSnapshot12: " + basePath);
     Table table_snapshot = Table.forPath(engine, basePath);
     Snapshot snapshot = table_snapshot.getLatestSnapshot(engine);
     InternalTable table = getTable(snapshot.getVersion());
@@ -98,14 +116,77 @@ public InternalSnapshot getCurrentSnapshot() {
   }
 
   @Override
-  public TableChange getTableChangeForCommit(Long aLong) {
-    return null;
+  public TableChange getTableChangeForCommit(Long versionNumber) {
+    Configuration hadoopConf = new Configuration();
+    Engine engine = DefaultEngine.create(hadoopConf);
+    Table table = Table.forPath(engine, basePath);
+    Snapshot snapshot = table.getSnapshotAsOfVersion(engine, versionNumber);
+    InternalTable tableAtVersion =
+        tableExtractor.table(table, snapshot, engine, tableName, basePath);
+    Map<String, InternalDataFile> addedFiles = new HashMap<>();
+    String provider = ((SnapshotImpl) snapshot).getMetadata().getFormat().getProvider();
+    FileFormat fileFormat = actionsConverter.convertToFileFormat(provider);
+    List<FileStatus> files =
+        DeltaLogActionUtils.listDeltaLogFilesAsIter(
+                engine,
+                Collections.singleton(FileNames.DeltaLogFileType.COMMIT),
+                new Path(basePath),
+                versionNumber,
+                Optional.of(versionNumber),
+                false)
+            .toInMemoryList();
+
+    ActionsIterator actionsIterator =
+        new ActionsIterator(engine, files, actionSchema, Optional.empty());
+    while (actionsIterator.hasNext()) {
+      // Each ActionWrapper may wrap a batch of rows (actions)
+      CloseableIterator<Row> scanFileRows = actionsIterator.next().getColumnarBatch().getRows();
+      while (scanFileRows.hasNext()) {
+        Row scanFileRow = scanFileRows.next();
+        // A SingleAction row carries a non-null "add" struct only for AddFile actions
+        int addIndex = scanFileRow.getSchema().indexOf("add");
+        if (addIndex >= 0 && !scanFileRow.isNullAt(addIndex)) {
+          AddFile addFile = new AddFile(scanFileRow.getStruct(addIndex));
+          Map<String, String> partitionValues =
+              InternalScanFileUtils.getPartitionValues(scanFileRow);
+          InternalDataFile dataFile =
+              actionsConverter.convertAddActionToInternalDataFile(
+                  addFile,
+                  snapshot,
+                  fileFormat,
+                  tableAtVersion.getPartitioningFields(),
+                  tableAtVersion.getReadSchema().getFields(),
+                  true,
+                  DeltaKernelPartitionExtractor.getInstance(),
+                  DeltaKernelStatsExtractor.getInstance(),
+                  partitionValues);
+          addedFiles.put(dataFile.getPhysicalPath(), dataFile);
+        }
+      }
+    }
+
+    InternalFilesDiff internalFilesDiff =
+        InternalFilesDiff.builder().filesAdded(addedFiles.values()).build();
+    return TableChange.builder()
+        .tableAsOfChange(tableAtVersion)
+        .filesDiff(internalFilesDiff)
+        .sourceIdentifier(getCommitIdentifier(versionNumber))
+        .build();
   }
 
   @Override
   public CommitsBacklog<Long> getCommitsBacklog(
       InstantsForIncrementalSync instantsForIncrementalSync) {
     return null;
+//    DeltaHistoryManager.Commit deltaCommitAtLastSyncInstant =
+//        deltaLog.
+// .getActiveCommitAtTime( +// Timestamp.from(instantsForIncrementalSync.getLastSyncInstant()), true, false, true); +// long versionNumberAtLastSyncInstant = deltaCommitAtLastSyncInstant.version(); +// resetState(versionNumberAtLastSyncInstant + 1); +// return CommitsBacklog.builder() +// .commitsToProcess(getChangesState().getVersionsInSortedOrder()) +// .build(); } @Override @@ -121,6 +202,7 @@ public String getCommitIdentifier(Long aLong) { private List getInternalDataFiles( io.delta.kernel.Snapshot snapshot, InternalSchema schema) { try (DataFileIterator fileIterator = dataFileExtractor.iterator(snapshot, schema)) { + List dataFiles = new ArrayList<>(); fileIterator.forEachRemaining(dataFiles::add); return PartitionFileGroup.fromFiles(dataFiles); @@ -132,25 +214,8 @@ private List getInternalDataFiles( @Override public void close() throws IOException {} - // - // @Override - // public InternalSnapshot getCurrentSnapshot() { - // throw new UnsupportedOperationException("Not implemented yet"); - // } - // - // @Override - // public TableChange getTableChangeForCommit(Long commit) { - // throw new UnsupportedOperationException("Not implemented yet"); - // } - // - // @Override - // public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync - // instantsForIncrementalSync) { - // throw new UnsupportedOperationException("Not implemented yet"); - // } - // - // @Override - // public void close() { - // // No resources to close - // } + private DeltaIncrementalChangesState getChangesState() { + return deltaIncrementalChangesState.orElseThrow( + () -> new IllegalStateException("DeltaIncrementalChangesState is not initialized")); + } } diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index 8823622a8..ffa353276 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -27,13 +27,19 @@ import java.nio.file.Paths; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.apache.spark.serializer.KryoSerializer; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import org.apache.xtable.TestSparkDeltaTable; +import org.apache.xtable.ValidationTestHelper; +import org.apache.xtable.model.*; import org.junit.jupiter.api.*; import org.junit.jupiter.api.io.TempDir; @@ -42,8 +48,6 @@ import org.apache.xtable.GenericTable; import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.kernel.DeltaKernelConversionSource; -import org.apache.xtable.model.InternalSnapshot; -import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.schema.*; import org.apache.xtable.model.stat.ColumnStat; import org.apache.xtable.model.stat.PartitionValue; @@ -51,6 +55,9 @@ import org.apache.xtable.model.storage.*; import org.apache.xtable.model.storage.DataLayoutStrategy; import org.apache.xtable.model.storage.TableFormat; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; public class ITDeltaKernelConversionSource { private static final InternalField COL1_INT_FIELD = @@ -150,7 +157,6 @@ void 
getCurrentSnapshotNonPartitionedTest() throws URISyntaxException { // Table name final String tableName = GenericTable.getTableName(); final Path basePath = tempDir.resolve(tableName); - System.out.println("Table Name Non partitioned : " + basePath); // Create table with a single row using Spark sparkSession.sql( "CREATE TABLE `" @@ -329,6 +335,71 @@ void getCurrentSnapshotPartitionedTest() throws URISyntaxException { snapshot.getPartitionedDataFiles().get(0)); } + + @ParameterizedTest + @MethodSource("testWithPartitionToggle") + public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { + String tableName = GenericTable.getTableName(); + TestSparkDeltaTable testSparkDeltaTable = + new TestSparkDeltaTable( + tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); +// System.out.println("testSparkDeltaTable" + testSparkDeltaTable.getColumnsToSelect()); + List> allActiveFiles = new ArrayList<>(); + List allTableChanges = new ArrayList<>(); + testSparkDeltaTable.insertRows(50); + testSparkDeltaTable.getLastCommitTimestamp(); + allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + + testSparkDeltaTable.insertRows(50); + allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + +// testSparkDeltaTable.upsertRows(rows.subList(0, 20)); +// allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); +// +// testSparkDeltaTable.insertRows(50); +// allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); +// +// +// testSparkDeltaTable.insertRows(50); +// allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + SourceTable tableConfig = + SourceTable.builder() + .name(testSparkDeltaTable.getTableName()) + .basePath(testSparkDeltaTable.getBasePath()) + .formatName(TableFormat.DELTA) + .build(); + DeltaKernelConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + assertEquals(100L, testSparkDeltaTable.getNumRows()); + InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); + + if (isPartitioned) { + validateDeltaPartitioning(internalSnapshot); + } + ValidationTestHelper.validateSnapshot( + internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); +// // Get changes in incremental format. 
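+//    NOTE: the commented flow below assumes a `long timestamp1` captured after the
+//    first insert (for example: long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp();,
+//    whose result the test currently discards) and mirrors the incremental-sync
+//    pattern used elsewhere in the XTable integration tests.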
+//    InstantsForIncrementalSync instantsForIncrementalSync =
+//        InstantsForIncrementalSync.builder()
+//            .lastSyncInstant(Instant.ofEpochMilli(timestamp1))
+//            .build();
+//    CommitsBacklog<Long> commitsBacklog =
+//        conversionSource.getCommitsBacklog(instantsForIncrementalSync);
+//    for (Long version : commitsBacklog.getCommitsToProcess()) {
+//      TableChange tableChange = conversionSource.getTableChangeForCommit(version);
+//      allTableChanges.add(tableChange);
+//    }
+//    ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges);
+  }
+
+  private void validateDeltaPartitioning(InternalSnapshot internalSnapshot) {
+    List<InternalPartitionField> partitionFields =
+        internalSnapshot.getTable().getPartitioningFields();
+    assertEquals(1, partitionFields.size());
+    InternalPartitionField partitionField = partitionFields.get(0);
+    assertEquals("birthDate", partitionField.getSourceField().getName());
+    assertEquals(PartitionTransformType.YEAR, partitionField.getTransformType());
+  }
   private void validatePartitionDataFiles(
       PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles)
       throws URISyntaxException {
@@ -348,6 +419,10 @@ private void validateDataFiles(
     }
   }
 
+  private static Stream<Arguments> testWithPartitionToggle() {
+    return Stream.of( Arguments.of(false), Arguments.of(true));
+  }
+
   private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual)
       throws URISyntaxException {
     Assertions.assertTrue(
@@ -356,8 +431,6 @@ private void validatePropertiesDataFile(InternalDataFile expected, InternalDataF
     Assertions.assertEquals(expected.getFileFormat(), actual.getFileFormat());
     Assertions.assertEquals(expected.getPartitionValues(), actual.getPartitionValues());
     Assertions.assertEquals(expected.getFileSizeBytes(), actual.getFileSizeBytes());
-    System.out.println("Expected File Size: " + expected);
-    System.out.println("Actual File Size: " + actual);
     Assertions.assertEquals(expected.getRecordCount(), actual.getRecordCount());
     Instant now = Instant.now();
     long minRange = now.minus(1, ChronoUnit.HOURS).toEpochMilli();
diff --git a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java
index 21230749d..a5f20d6b9 100644
--- a/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java
+++ b/xtable-core/src/test/java/org/apache/xtable/testutil/ITTestUtils.java
@@ -48,10 +48,6 @@ public static void validateTable(
       String basePath,
       String latestMetadataPath,
       List partitioningFields) {
-
-    System.out.println("readSchema: " + readSchema);
-    System.out.println("internalTable readSchema: " + internalTable.getReadSchema());
-
     Assertions.assertEquals(tableName, internalTable.getName());
     Assertions.assertEquals(tableFormat, internalTable.getTableFormat());
     Assertions.assertEquals(readSchema, internalTable.getReadSchema());

From 809bfe86b917a0612e75ae75adff85d5e59317b3 Mon Sep 17 00:00:00 2001
From: vaibhavk1992
Date: Thu, 7 Aug 2025 19:37:19 +0530
Subject: [PATCH 15/16] fix table base path resolution listing wrong data file paths

---
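Note: the heart of this patch is getFullPathToFile(...). Delta log AddFile entries normally record data files relative to the table root, so a path that does not already start with the base path has to be joined onto it; returning the bare base path (the previous behavior) produced wrong listings. Below is a minimal standalone sketch of that rule; the class, method, and paths are illustrative only, not part of this patch:

// PathResolutionSketch.java -- illustrative names only.
public final class PathResolutionSketch {

  // Pass absolute paths through untouched; join relative ones to the root.
  static String resolveDataFilePath(String tableBasePath, String dataFilePath) {
    if (dataFilePath.startsWith(tableBasePath)) {
      return dataFilePath;
    }
    return tableBasePath + "/" + dataFilePath;
  }

  public static void main(String[] args) {
    String base = "file:///tmp/warehouse/taxis"; // placeholder table root
    System.out.println(resolveDataFilePath(base, "part-00000.parquet"));
    System.out.println(resolveDataFilePath(base, base + "/part-00001.parquet"));
  }
}

 .../delta/DeltaKernelActionsConverter.java    | 14 +++---
 .../delta/DeltaKernelDataFileExtractor.java   | 17 +++-----
 .../kernel/DeltaKernelConversionSource.java   | 43 +++++++++++++++----
 .../delta/ITDeltaKernelConversionSource.java  | 32 +++++++-------
 4 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java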
index 3a6c47089..1e9be6e93 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
@@ -60,7 +60,7 @@ public static DeltaKernelActionsConverter getInstance() {
 
   public InternalDataFile convertAddActionToInternalDataFile(
       AddFile addFile,
-      Snapshot deltaSnapshot,
+      Table table,
       FileFormat fileFormat,
       List<InternalPartitionField> partitionFields,
       List<InternalField> fields,
@@ -73,16 +73,13 @@ public InternalDataFile convertAddActionToInternalDataFile(
     List<ColumnStat> columnStats =
         includeColumnStats ? fileStats.getColumnStats() : Collections.emptyList();
     long recordCount = fileStats.getNumRecords();
-    Configuration hadoopConf = new Configuration();
-    Engine myEngine = DefaultEngine.create(hadoopConf);
-    Table myTable = Table.forPath(myEngine, addFile.getPath());
 
     // The immutable map from Java to Scala is not working, need to
     scala.collection.mutable.Map<String, String> scalaMap =
         JavaConverters.mapAsScalaMap(partitionValues);
 
     return InternalDataFile.builder()
-        .physicalPath(getFullPathToFile(deltaSnapshot, addFile.getPath(), myTable))
+        .physicalPath(getFullPathToFile( addFile.getPath(), table))
         .fileFormat(fileFormat)
         .fileSizeBytes(addFile.getSize())
         .lastModified(addFile.getModificationTime())
@@ -102,16 +99,15 @@ public FileFormat convertToFileFormat(String provider) {
         String.format("delta file format %s is not recognized", provider));
   }
 
-  static String getFullPathToFile(Snapshot snapshot, String dataFilePath, Table myTable) {
+  static String getFullPathToFile( String dataFilePath, Table table) {
     Configuration hadoopConf = new Configuration();
     Engine myEngine = DefaultEngine.create(hadoopConf);
-//    Table myTable = Table.forPath(myEngine, basePath.toString());
-    String tableBasePath = myTable.getPath(myEngine);
+    String tableBasePath = table.getPath(myEngine);;
 // String tableBasePath = snapshot.dataPath().toUri().toString();
     if (dataFilePath.startsWith(tableBasePath)) {
       return dataFilePath;
     }
-    return tableBasePath ;
+    return tableBasePath + Path.SEPARATOR + dataFilePath;
   }
 }
diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
index bc776b071..ba6cc7c7e 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
@@ -26,7 +26,7 @@
 import lombok.Builder;
 
 import org.apache.hadoop.conf.Configuration;
-
+import io.delta.kernel.Table;
 import io.delta.kernel.Snapshot;
 import io.delta.kernel.data.FilteredColumnarBatch;
 import io.delta.kernel.data.Row;
@@ -70,8 +70,8 @@ public class DeltaKernelDataFileExtractor {
    *
    * @return Delta table file iterator
    */
-  public DataFileIterator iterator(Snapshot deltaSnapshot, InternalSchema schema) {
-    return new DeltaDataFileIterator(deltaSnapshot, schema, true);
+  public DataFileIterator iterator(Snapshot deltaSnapshot, Table table, Engine engine, InternalSchema schema) {
+    return new DeltaDataFileIterator(deltaSnapshot, table, engine, schema, true);
   }
 
   public class DeltaDataFileIterator implements DataFileIterator {
@@ -81,7 +81,7 @@ public class DeltaDataFileIterator implements DataFileIterator {
     private Iterator<InternalDataFile> dataFilesIterator = Collections.emptyIterator();
 
     private DeltaDataFileIterator(
-        Snapshot snapshot, InternalSchema schema, boolean includeColumnStats) {
+        Snapshot snapshot, Table table, Engine engine, InternalSchema schema, boolean 
includeColumnStats) { String provider = ((SnapshotImpl) snapshot).getMetadata().getFormat().getProvider(); this.fileFormat = actionsConverter.convertToFileFormat(provider); @@ -99,18 +99,11 @@ private DeltaDataFileIterator( this.partitionFields = partitionExtractor.convertFromDeltaPartitionFormat(schema, partitionSchema); - Configuration hadoopConf = new Configuration(); - Engine engine = DefaultEngine.create(hadoopConf); - - // Scan myScan = snapshot.getScanBuilder().build(); - // CloseableIterator scanFiles = myScan.getScanFiles(engine); ScanImpl myScan = (ScanImpl) snapshot.getScanBuilder().build(); CloseableIterator scanFiles = myScan.getScanFiles(engine, includeColumnStats); - // String statsJson = extractStatsJson(scanFiles,fullSchema); - // System.out.println("StatsJson: " + statsJson); List dataFiles = new ArrayList<>(); this.dataFilesIterator = Collections @@ -130,7 +123,7 @@ private DeltaDataFileIterator( dataFiles.add( actionsConverter.convertAddActionToInternalDataFile( addFile, - snapshot, + table, fileFormat, partitionFields, fields, diff --git a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java index e056882f8..c3f8f34d5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java @@ -110,7 +110,7 @@ public InternalSnapshot getCurrentSnapshot() { InternalTable table = getTable(snapshot.getVersion()); return InternalSnapshot.builder() .table(table) - .partitionedDataFiles(getInternalDataFiles(snapshot, table.getReadSchema())) + .partitionedDataFiles(getInternalDataFiles(snapshot, table_snapshot, engine, table.getReadSchema())) .sourceIdentifier(getCommitIdentifier(snapshot.getVersion())) .build(); } @@ -149,7 +149,7 @@ public TableChange getTableChangeForCommit(Long versionNumber) { InternalDataFile dataFile = actionsConverter.convertAddActionToInternalDataFile( (AddFile) scanFileRow, - snapshot, + table, fileFormat, tableAtVersion.getPartitioningFields(), tableAtVersion.getReadSchema().getFields(), @@ -177,7 +177,6 @@ public TableChange getTableChangeForCommit(Long versionNumber) { @Override public CommitsBacklog getCommitsBacklog( InstantsForIncrementalSync instantsForIncrementalSync) { - return null; // DeltaHistoryManager.Commit deltaCommitAtLastSyncInstant = // deltaLog. 
// .getActiveCommitAtTime( @@ -187,21 +186,49 @@ public CommitsBacklog getCommitsBacklog( // return CommitsBacklog.builder() // .commitsToProcess(getChangesState().getVersionsInSortedOrder()) // .build(); + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + Table table = Table.forPath(engine, basePath); + Snapshot snapshot = table.getSnapshotAsOfTimestamp(engine, Timestamp.from(instantsForIncrementalSync.getLastSyncInstant()).getTime()); + + long versionNumberAtLastSyncInstant = snapshot.getVersion(); +// resetState(versionNumberAtLastSyncInstant + 1); + return CommitsBacklog.builder() + .commitsToProcess(getChangesState().getVersionsInSortedOrder()) + .build(); + } @Override public boolean isIncrementalSyncSafeFrom(Instant instant) { - return false; + Configuration hadoopConf = new Configuration(); + Engine engine = DefaultEngine.create(hadoopConf); + Table table = Table.forPath(engine, basePath); + Snapshot snapshot = table.getSnapshotAsOfTimestamp(engine, Timestamp.from(instant).getTime()); + + // There is a chance earliest commit of the table is returned if the instant is before the + // earliest commit of the table, hence the additional check. + Instant deltaCommitInstant = Instant.ofEpochMilli(snapshot.getTimestamp(engine)); + return deltaCommitInstant.equals(instant) || deltaCommitInstant.isBefore(instant); } @Override - public String getCommitIdentifier(Long aLong) { - return ""; + public String getCommitIdentifier(Long commit) { + return String.valueOf(commit); } +// +// private void resetState(long versionToStartFrom) { +// deltaIncrementalChangesState = +// Optional.of( +// DeltaIncrementalChangesState.builder() +// .deltaLog(deltaLog) +// .versionToStartFrom(versionToStartFrom) +// .build()); +// } private List getInternalDataFiles( - io.delta.kernel.Snapshot snapshot, InternalSchema schema) { - try (DataFileIterator fileIterator = dataFileExtractor.iterator(snapshot, schema)) { + io.delta.kernel.Snapshot snapshot, Table table, Engine engine, InternalSchema schema) { + try (DataFileIterator fileIterator = dataFileExtractor.iterator(snapshot, table, engine, schema)) { List dataFiles = new ArrayList<>(); fileIterator.forEachRemaining(dataFiles::add); diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index ffa353276..e657dbbe3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -346,22 +346,22 @@ public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { // System.out.println("testSparkDeltaTable" + testSparkDeltaTable.getColumnsToSelect()); List> allActiveFiles = new ArrayList<>(); List allTableChanges = new ArrayList<>(); + List rows = testSparkDeltaTable.insertRows(50); + Long timestamp1 = testSparkDeltaTable.getLastCommitTimestamp(); + allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + testSparkDeltaTable.insertRows(50); - testSparkDeltaTable.getLastCommitTimestamp(); + allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); + + testSparkDeltaTable.upsertRows(rows.subList(0, 20)); allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); testSparkDeltaTable.insertRows(50); allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); -// testSparkDeltaTable.upsertRows(rows.subList(0, 20)); -// 
allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
-//
-//    testSparkDeltaTable.insertRows(50);
-//    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
-//
-//
-//    testSparkDeltaTable.insertRows(50);
-//    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
+
+    testSparkDeltaTable.insertRows(50);
+    allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles());
     SourceTable tableConfig =
         SourceTable.builder()
             .name(testSparkDeltaTable.getTableName())
             .basePath(testSparkDeltaTable.getBasePath())
             .formatName(TableFormat.DELTA)
             .build();
     DeltaKernelConversionSource conversionSource =
         conversionSourceProvider.getConversionSourceInstance(tableConfig);
-    assertEquals(100L, testSparkDeltaTable.getNumRows());
+    assertEquals(200L, testSparkDeltaTable.getNumRows());
     InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot();
 
@@ -378,11 +378,11 @@ public void testInsertsUpsertsAndDeletes(boolean isPartitioned) {
     }
     ValidationTestHelper.validateSnapshot(
         internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1));
-//    // Get changes in incremental format.
-//    InstantsForIncrementalSync instantsForIncrementalSync =
-//        InstantsForIncrementalSync.builder()
-//            .lastSyncInstant(Instant.ofEpochMilli(timestamp1))
-//            .build();
+    // Get changes in incremental format.
+    InstantsForIncrementalSync instantsForIncrementalSync =
+        InstantsForIncrementalSync.builder()
+            .lastSyncInstant(Instant.ofEpochMilli(timestamp1))
+            .build();
 //    CommitsBacklog<Long> commitsBacklog =
 //        conversionSource.getCommitsBacklog(instantsForIncrementalSync);
 //    for (Long version : commitsBacklog.getCommitsToProcess()) {
 //      TableChange tableChange = conversionSource.getTableChangeForCommit(version);
 //      allTableChanges.add(tableChange);
 //    }
 //    ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges);

From 40172f20b1f8435204c9c28599c602c08571a35b Mon Sep 17 00:00:00 2001
From: vaibhavk1992
Date: Thu, 7 Aug 2025 20:01:05 +0530
Subject: [PATCH 16/16] apply spotless formatting on top of the base path fix

---
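Note: this follow-up is essentially a spotless formatting pass over the previous commit; the functional code it reflows is the timestamp-based snapshot lookup used by getCommitsBacklog(...) and isIncrementalSyncSafeFrom(...). A small standalone sketch of that lookup against the Delta Kernel calls used in this series follows; the table path and the final check are illustrative, not part of this patch:

// SnapshotAtInstantSketch.java -- illustrative; assumes delta-kernel-api and
// delta-kernel-defaults on the classpath.
import java.time.Instant;

import org.apache.hadoop.conf.Configuration;

import io.delta.kernel.Snapshot;
import io.delta.kernel.Table;
import io.delta.kernel.defaults.engine.DefaultEngine;
import io.delta.kernel.engine.Engine;

public final class SnapshotAtInstantSketch {
  public static void main(String[] args) {
    Engine engine = DefaultEngine.create(new Configuration());
    Table table = Table.forPath(engine, "/tmp/warehouse/taxis"); // placeholder path
    Instant lastSync = Instant.now().minusSeconds(3600);

    // Resolve the snapshot that was current at the last-sync instant; Kernel
    // may clamp to the earliest available commit, so the commit timestamp is
    // re-checked before trusting incremental sync, mirroring
    // isIncrementalSyncSafeFrom above.
    Snapshot snapshot = table.getSnapshotAsOfTimestamp(engine, lastSync.toEpochMilli());
    long version = snapshot.getVersion();
    Instant commitInstant = Instant.ofEpochMilli(snapshot.getTimestamp(engine));
    boolean incrementalSafe = !commitInstant.isAfter(lastSync);
    System.out.println("version=" + version + ", incrementalSyncSafe=" + incrementalSafe);
  }
}

 .../delta/DeltaKernelActionsConverter.java    |  19 +--
 .../delta/DeltaKernelDataFileExtractor.java   |  34 ++--
 .../delta/DeltaKernelSchemaExtractor.java     | 124 +++++++--------
 .../delta/DeltaKernelStatsExtractor.java      |  15 +-
 .../kernel/DeltaKernelConversionSource.java   | 147 +++++++++---------
 .../delta/ITDeltaKernelConversionSource.java  |  59 ++++---
 6 files changed, 192 insertions(+), 206 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
index 1e9be6e93..6531ebb6e 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelActionsConverter.java
@@ -21,12 +21,9 @@
 import static org.apache.xtable.delta.DeltaActionsConverter.getFullPathToFile;
 
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import io.delta.kernel.data.MapValue;
-import io.delta.kernel.internal.InternalScanFileUtils;
 import lombok.AccessLevel;
 import lombok.NoArgsConstructor;
 
@@ -35,7 +32,6 @@
 import scala.collection.JavaConverters;
 
-import io.delta.kernel.Snapshot;
 import io.delta.kernel.Table;
 import io.delta.kernel.defaults.engine.DefaultEngine;
 import io.delta.kernel.engine.Engine;
@@ -67,8 +63,7 @@ public InternalDataFile convertAddActionToInternalDataFile(
       boolean includeColumnStats,
       DeltaKernelPartitionExtractor partitionExtractor,
       DeltaKernelStatsExtractor fileStatsExtractor,
-      Map<String, String> partitionValues)
-      {
+      Map<String, String> partitionValues) {
     FileStats fileStats = 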
        fileStatsExtractor.getColumnStatsForFile(addFile, fields);
     List<ColumnStat> columnStats =
         includeColumnStats ? fileStats.getColumnStats() : Collections.emptyList();
@@ -76,10 +71,10 @@ public InternalDataFile convertAddActionToInternalDataFile(
 
     // The immutable map from Java to Scala is not working, need to
     scala.collection.mutable.Map<String, String> scalaMap =
-        JavaConverters.mapAsScalaMap(partitionValues);
+        JavaConverters.mapAsScalaMap(partitionValues);
 
     return InternalDataFile.builder()
-        .physicalPath(getFullPathToFile( addFile.getPath(), table))
+        .physicalPath(getFullPathToFile(addFile.getPath(), table))
         .fileFormat(fileFormat)
         .fileSizeBytes(addFile.getSize())
         .lastModified(addFile.getModificationTime())
@@ -99,15 +94,14 @@ public FileFormat convertToFileFormat(String provider) {
         String.format("delta file format %s is not recognized", provider));
   }
 
-  static String getFullPathToFile( String dataFilePath, Table table) {
+  static String getFullPathToFile(String dataFilePath, Table table) {
     Configuration hadoopConf = new Configuration();
     Engine myEngine = DefaultEngine.create(hadoopConf);
-    String tableBasePath = table.getPath(myEngine);;
+    String tableBasePath = table.getPath(myEngine);
 // String tableBasePath = snapshot.dataPath().toUri().toString();
     if (dataFilePath.startsWith(tableBasePath)) {
       return dataFilePath;
     }
     return tableBasePath + Path.SEPARATOR + dataFilePath;
   }
-
 }
diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
index ba6cc7c7e..ecc0c1276 100644
--- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
+++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelDataFileExtractor.java
@@ -25,12 +25,10 @@
 
 import lombok.Builder;
 
-import org.apache.hadoop.conf.Configuration;
-import io.delta.kernel.Table;
 import io.delta.kernel.Snapshot;
+import io.delta.kernel.Table;
 import io.delta.kernel.data.FilteredColumnarBatch;
 import io.delta.kernel.data.Row;
-import io.delta.kernel.defaults.engine.DefaultEngine;
 import io.delta.kernel.engine.Engine;
 import io.delta.kernel.internal.InternalScanFileUtils;
 import io.delta.kernel.internal.ScanImpl;
@@ -70,7 +68,8 @@ public class DeltaKernelDataFileExtractor {
    *
    * @return Delta table file iterator
    */
-  public DataFileIterator iterator(Snapshot deltaSnapshot, Table table, Engine engine, InternalSchema schema) {
+  public DataFileIterator iterator(
+      Snapshot deltaSnapshot, Table table, Engine engine, InternalSchema schema) {
     return new DeltaDataFileIterator(deltaSnapshot, table, engine, schema, true);
   }
 
@@ -81,7 +80,11 @@ public class DeltaDataFileIterator implements DataFileIterator {
     private Iterator<InternalDataFile> dataFilesIterator = Collections.emptyIterator();
 
     private DeltaDataFileIterator(
-        Snapshot snapshot, Table table, Engine engine, InternalSchema schema, boolean includeColumnStats) {
+        Snapshot snapshot,
+        Table table,
+        Engine engine,
+        InternalSchema schema,
+        boolean includeColumnStats) {
       String provider = ((SnapshotImpl) snapshot).getMetadata().getFormat().getProvider();
       this.fileFormat = actionsConverter.convertToFileFormat(provider);
 
@@ -121,17 +124,16 @@ private DeltaDataFileIterator(
             InternalScanFileUtils.getPartitionValues(scanFileRow);
         // Convert the FileStatus to InternalDataFile using the actionsConverter
         dataFiles.add(
-            actionsConverter.convertAddActionToInternalDataFile(
-                addFile,
-                table,
-                fileFormat,
-
partitionFields, - fields, - includeColumnStats, - partitionExtractor, - fileStatsExtractor, - partitionValues)); - + actionsConverter.convertAddActionToInternalDataFile( + addFile, + table, + fileFormat, + partitionFields, + fields, + includeColumnStats, + partitionExtractor, + fileStatsExtractor, + partitionValues)); } } this.dataFilesIterator = dataFiles.iterator(); diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java index a92fce7f3..5371a2b9b 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelSchemaExtractor.java @@ -23,7 +23,6 @@ import io.delta.kernel.types.*; import org.apache.xtable.collectors.CustomCollectors; -import org.apache.xtable.exception.NotSupportedException; import org.apache.xtable.model.schema.InternalField; import org.apache.xtable.model.schema.InternalSchema; import org.apache.xtable.model.schema.InternalType; @@ -43,14 +42,18 @@ public static DeltaKernelSchemaExtractor getInstance() { } public InternalSchema toInternalSchema(StructType structType) { - return toInternalSchema(structType, null, false, null,null); + return toInternalSchema(structType, null, false, null, null); } String trimmedTypeName = ""; InternalType type = null; private InternalSchema toInternalSchema( - DataType dataType, String parentPath, boolean nullable, String comment, FieldMetadata originalMetadata) { + DataType dataType, + String parentPath, + boolean nullable, + String comment, + FieldMetadata originalMetadata) { Map metadata = null; List fields = null; @@ -58,52 +61,42 @@ private InternalSchema toInternalSchema( if (dataType instanceof IntegerType) { type = InternalType.INT; trimmedTypeName = "integer"; - } - else if(dataType instanceof StringType) { + } else if (dataType instanceof StringType) { type = InternalType.STRING; trimmedTypeName = "string"; - } - else if (dataType instanceof BooleanType) { + } else if (dataType instanceof BooleanType) { type = InternalType.BOOLEAN; trimmedTypeName = "boolean"; - } - else if (dataType instanceof FloatType) { + } else if (dataType instanceof FloatType) { type = InternalType.FLOAT; trimmedTypeName = "float"; - } - else if (dataType instanceof DoubleType) { + } else if (dataType instanceof DoubleType) { type = InternalType.DOUBLE; trimmedTypeName = "double"; - } - else if (dataType instanceof BinaryType) { + } else if (dataType instanceof BinaryType) { if (originalMetadata.contains(InternalSchema.XTABLE_LOGICAL_TYPE) - && "uuid".equals(originalMetadata.getString(InternalSchema.XTABLE_LOGICAL_TYPE))) { + && "uuid".equals(originalMetadata.getString(InternalSchema.XTABLE_LOGICAL_TYPE))) { type = InternalType.UUID; trimmedTypeName = "binary"; } else { type = InternalType.BYTES; trimmedTypeName = "binary"; } - } - else if (dataType instanceof LongType) { + } else if (dataType instanceof LongType) { type = InternalType.LONG; trimmedTypeName = "long"; - } - else if (dataType instanceof DateType) { + } else if (dataType instanceof DateType) { type = InternalType.DATE; trimmedTypeName = "date"; - } - else if (dataType instanceof TimestampType) { + } else if (dataType instanceof TimestampType) { type = InternalType.TIMESTAMP; metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; trimmedTypeName = "timestamp"; - } - else if (dataType instanceof TimestampNTZType) { + } else if (dataType instanceof TimestampNTZType) { type = 
InternalType.TIMESTAMP_NTZ; metadata = DEFAULT_TIMESTAMP_PRECISION_METADATA; trimmedTypeName = "timestamp_ntz"; - } - else if (dataType instanceof StructType) { + } else if (dataType instanceof StructType) { // Handle StructType StructType structType = (StructType) dataType; // your logic here @@ -132,7 +125,7 @@ else if (dataType instanceof StructType) { SchemaUtils.getFullyQualifiedPath(parentPath, field.getName()), field.isNullable(), fieldComment, - field.getMetadata()); + field.getMetadata()); return InternalField.builder() .name(field.getName()) .fieldId(fieldId) @@ -145,8 +138,7 @@ else if (dataType instanceof StructType) { .collect(CustomCollectors.toList(structType.fields().size())); type = InternalType.RECORD; trimmedTypeName = "struct"; - } - else if (dataType instanceof DecimalType) { + } else if (dataType instanceof DecimalType) { DecimalType decimalType = (DecimalType) dataType; metadata = new HashMap<>(2, 1.0f); metadata.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, decimalType.getPrecision()); @@ -154,57 +146,55 @@ else if (dataType instanceof DecimalType) { type = InternalType.DECIMAL; trimmedTypeName = "decimal"; - } - else if (dataType instanceof ArrayType) { + } else if (dataType instanceof ArrayType) { ArrayType arrayType = (ArrayType) dataType; InternalSchema elementSchema = - toInternalSchema( - arrayType.getElementType(), - SchemaUtils.getFullyQualifiedPath( - parentPath, InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME), - arrayType.containsNull(), - null, - null); + toInternalSchema( + arrayType.getElementType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME), + arrayType.containsNull(), + null, + null); InternalField elementField = - InternalField.builder() - .name(InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME) - .parentPath(parentPath) - .schema(elementSchema) - .build(); + InternalField.builder() + .name(InternalField.Constants.ARRAY_ELEMENT_FIELD_NAME) + .parentPath(parentPath) + .schema(elementSchema) + .build(); type = InternalType.LIST; fields = Collections.singletonList(elementField); trimmedTypeName = "array"; - } - else if (dataType instanceof MapType) { + } else if (dataType instanceof MapType) { MapType mapType = (MapType) dataType; InternalSchema keySchema = - toInternalSchema( - mapType.getKeyType(), - SchemaUtils.getFullyQualifiedPath( - parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), - false, - null, - null); + toInternalSchema( + mapType.getKeyType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), + false, + null, + null); InternalField keyField = - InternalField.builder() - .name(InternalField.Constants.MAP_KEY_FIELD_NAME) - .parentPath(parentPath) - .schema(keySchema) - .build(); + InternalField.builder() + .name(InternalField.Constants.MAP_KEY_FIELD_NAME) + .parentPath(parentPath) + .schema(keySchema) + .build(); InternalSchema valueSchema = - toInternalSchema( - mapType.getValueType(), - SchemaUtils.getFullyQualifiedPath( - parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), - mapType.isValueContainsNull(), - null, - null); + toInternalSchema( + mapType.getValueType(), + SchemaUtils.getFullyQualifiedPath( + parentPath, InternalField.Constants.MAP_VALUE_FIELD_NAME), + mapType.isValueContainsNull(), + null, + null); InternalField valueField = - InternalField.builder() - .name(InternalField.Constants.MAP_VALUE_FIELD_NAME) - .parentPath(parentPath) - .schema(valueSchema) - .build(); + InternalField.builder() + 
.name(InternalField.Constants.MAP_VALUE_FIELD_NAME) + .parentPath(parentPath) + .schema(valueSchema) + .build(); type = InternalType.MAP; fields = Arrays.asList(keyField, valueField); trimmedTypeName = "map"; diff --git a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java index 1793efa39..bedc063f5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java +++ b/xtable-core/src/main/java/org/apache/xtable/delta/DeltaKernelStatsExtractor.java @@ -203,13 +203,18 @@ public FileStats getColumnStatsForFile(AddFile addFile, List fiel Object minRaw = fieldPathToMinValue.get(fieldPath); Object maxRaw = fieldPathToMaxValue.get(fieldPath); Object nullCountRaw = fieldPathToNullCount.get(fieldPath); - Object minValue = minRaw != null - ? DeltaValueConverter.convertFromDeltaColumnStatValue(minRaw, field.getSchema()) + Object minValue = + minRaw != null + ? DeltaValueConverter.convertFromDeltaColumnStatValue( + minRaw, field.getSchema()) : null; - Object maxValue = maxRaw != null - ? DeltaValueConverter.convertFromDeltaColumnStatValue(maxRaw, field.getSchema()) + Object maxValue = + maxRaw != null + ? DeltaValueConverter.convertFromDeltaColumnStatValue( + maxRaw, field.getSchema()) : null; - long nullCount = nullCountRaw instanceof Number ? ((Number) nullCountRaw).longValue() : 0; + long nullCount = + nullCountRaw instanceof Number ? ((Number) nullCountRaw).longValue() : 0; Range range = Range.vector(minValue, maxValue); return ColumnStat.builder() .field(field) diff --git a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java index c3f8f34d5..aa63cc581 100644 --- a/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/kernel/DeltaKernelConversionSource.java @@ -23,29 +23,27 @@ import java.time.Instant; import java.util.*; -import io.delta.kernel.internal.InternalScanFileUtils; -import io.delta.kernel.internal.SnapshotImpl; -import io.delta.kernel.internal.util.FileNames; import lombok.Builder; import org.apache.hadoop.conf.Configuration; + import io.delta.kernel.Snapshot; import io.delta.kernel.Table; +import io.delta.kernel.data.Row; import io.delta.kernel.defaults.engine.DefaultEngine; import io.delta.kernel.engine.Engine; -import io.delta.kernel.internal.actions.*; import io.delta.kernel.internal.DeltaLogActionUtils; -import io.delta.kernel.internal.replay.ActionsIterator; +import io.delta.kernel.internal.InternalScanFileUtils; +import io.delta.kernel.internal.SnapshotImpl; +import io.delta.kernel.internal.actions.*; import io.delta.kernel.internal.actions.SingleAction; -import io.delta.kernel.internal.util.FileNames.DeltaLogFileType; +import io.delta.kernel.internal.fs.Path; +import io.delta.kernel.internal.replay.ActionsIterator; +import io.delta.kernel.internal.util.FileNames; import io.delta.kernel.types.StructType; -import io.delta.kernel.data.Row; import io.delta.kernel.utils.CloseableIterator; import io.delta.kernel.utils.FileStatus; -import io.delta.kernel.internal.fs.Path; - -import org.apache.spark.sql.delta.DeltaHistoryManager; import org.apache.xtable.delta.*; import org.apache.xtable.exception.ReadException; import org.apache.xtable.model.*; @@ -56,7 +54,6 @@ import org.apache.xtable.model.storage.PartitionFileGroup; import 
org.apache.xtable.spi.extractor.ConversionSource; import org.apache.xtable.spi.extractor.DataFileIterator; -import scala.Option; @Builder public class DeltaKernelConversionSource implements ConversionSource { @@ -64,8 +61,10 @@ public class DeltaKernelConversionSource implements ConversionSource { @Builder.Default private final DeltaKernelDataFileExtractor dataFileExtractor = DeltaKernelDataFileExtractor.builder().build(); + @Builder.Default - private final DeltaKernelActionsConverter actionsConverter = DeltaKernelActionsConverter.getInstance(); + private final DeltaKernelActionsConverter actionsConverter = + DeltaKernelActionsConverter.getInstance(); private final String basePath; private final String tableName; @@ -77,6 +76,7 @@ public class DeltaKernelConversionSource implements ConversionSource { @Builder.Default private final DeltaKernelTableExtractor tableExtractor = DeltaKernelTableExtractor.builder().build(); + private Optional deltaIncrementalChangesState = Optional.empty(); @Override @@ -110,7 +110,8 @@ public InternalSnapshot getCurrentSnapshot() { InternalTable table = getTable(snapshot.getVersion()); return InternalSnapshot.builder() .table(table) - .partitionedDataFiles(getInternalDataFiles(snapshot, table_snapshot, engine, table.getReadSchema())) + .partitionedDataFiles( + getInternalDataFiles(snapshot, table_snapshot, engine, table.getReadSchema())) .sourceIdentifier(getCommitIdentifier(snapshot.getVersion())) .build(); } @@ -121,82 +122,75 @@ public TableChange getTableChangeForCommit(Long versionNumber) { Engine engine = DefaultEngine.create(hadoopConf); Table table = Table.forPath(engine, basePath); Snapshot snapshot = table.getSnapshotAsOfVersion(engine, versionNumber); - InternalTable tableAtVersion = tableExtractor.table(table, snapshot, engine, tableName, basePath); + InternalTable tableAtVersion = + tableExtractor.table(table, snapshot, engine, tableName, basePath); Map addedFiles = new HashMap<>(); String provider = ((SnapshotImpl) snapshot).getMetadata().getFormat().getProvider(); - FileFormat fileFormat = - actionsConverter.convertToFileFormat(provider); - List files = DeltaLogActionUtils.listDeltaLogFilesAsIter( - engine, - Collections.singleton(FileNames.DeltaLogFileType.COMMIT), - new Path(basePath), - versionNumber, - Optional.of(versionNumber), - false - ).toInMemoryList(); + FileFormat fileFormat = actionsConverter.convertToFileFormat(provider); + List files = + DeltaLogActionUtils.listDeltaLogFilesAsIter( + engine, + Collections.singleton(FileNames.DeltaLogFileType.COMMIT), + new Path(basePath), + versionNumber, + Optional.of(versionNumber), + false) + .toInMemoryList(); List actions = new ArrayList<>(); - ActionsIterator actionsIterator = new ActionsIterator(engine, files, actionSchema, Optional.empty()); + ActionsIterator actionsIterator = + new ActionsIterator(engine, files, actionSchema, Optional.empty()); while (actionsIterator.hasNext()) { // Each ActionWrapper may wrap a batch of rows (actions) CloseableIterator scanFileRows = actionsIterator.next().getColumnarBatch().getRows(); while (scanFileRows.hasNext()) { Row scanFileRow = scanFileRows.next(); - if (scanFileRow instanceof AddFile){ + if (scanFileRow instanceof AddFile) { Map partitionValues = - InternalScanFileUtils.getPartitionValues(scanFileRow); -// List actionsForVersion = getChangesState().getActionsForVersion(versionNumber); - InternalDataFile dataFile = - actionsConverter.convertAddActionToInternalDataFile( - (AddFile) scanFileRow, - table, - fileFormat, - 
tableAtVersion.getPartitioningFields(), - tableAtVersion.getReadSchema().getFields(), - true, - DeltaKernelPartitionExtractor.getInstance(), - DeltaKernelStatsExtractor.getInstance(), - partitionValues - ); - addedFiles.put(dataFile.getPhysicalPath(), dataFile); + InternalScanFileUtils.getPartitionValues(scanFileRow); + // List actionsForVersion = + // getChangesState().getActionsForVersion(versionNumber); + InternalDataFile dataFile = + actionsConverter.convertAddActionToInternalDataFile( + (AddFile) scanFileRow, + table, + fileFormat, + tableAtVersion.getPartitioningFields(), + tableAtVersion.getReadSchema().getFields(), + true, + DeltaKernelPartitionExtractor.getInstance(), + DeltaKernelStatsExtractor.getInstance(), + partitionValues); + addedFiles.put(dataFile.getPhysicalPath(), dataFile); + } } - }} - + } InternalFilesDiff internalFilesDiff = - InternalFilesDiff.builder() - .filesAdded(addedFiles.values()) - .build(); + InternalFilesDiff.builder().filesAdded(addedFiles.values()).build(); return TableChange.builder() - .tableAsOfChange(tableAtVersion) - .filesDiff(internalFilesDiff) - .sourceIdentifier(getCommitIdentifier(versionNumber)) - .build(); + .tableAsOfChange(tableAtVersion) + .filesDiff(internalFilesDiff) + .sourceIdentifier(getCommitIdentifier(versionNumber)) + .build(); } @Override public CommitsBacklog getCommitsBacklog( InstantsForIncrementalSync instantsForIncrementalSync) { -// DeltaHistoryManager.Commit deltaCommitAtLastSyncInstant = -// deltaLog. -// .getActiveCommitAtTime( -// Timestamp.from(instantsForIncrementalSync.getLastSyncInstant()), true, false, true); -// long versionNumberAtLastSyncInstant = deltaCommitAtLastSyncInstant.version(); -// resetState(versionNumberAtLastSyncInstant + 1); -// return CommitsBacklog.builder() -// .commitsToProcess(getChangesState().getVersionsInSortedOrder()) -// .build(); Configuration hadoopConf = new Configuration(); Engine engine = DefaultEngine.create(hadoopConf); Table table = Table.forPath(engine, basePath); - Snapshot snapshot = table.getSnapshotAsOfTimestamp(engine, Timestamp.from(instantsForIncrementalSync.getLastSyncInstant()).getTime()); + Snapshot snapshot = + table.getSnapshotAsOfTimestamp( + engine, Timestamp.from(instantsForIncrementalSync.getLastSyncInstant()).getTime()); long versionNumberAtLastSyncInstant = snapshot.getVersion(); -// resetState(versionNumberAtLastSyncInstant + 1); + System.out.println("versionNumberAtLastSyncInstant: " + versionNumberAtLastSyncInstant); + // resetState(versionNumberAtLastSyncInstant + 1); return CommitsBacklog.builder() - .commitsToProcess(getChangesState().getVersionsInSortedOrder()) - .build(); - + .commitsToProcess(getChangesState().getVersionsInSortedOrder()) + .build(); } @Override @@ -216,19 +210,20 @@ public boolean isIncrementalSyncSafeFrom(Instant instant) { public String getCommitIdentifier(Long commit) { return String.valueOf(commit); } -// -// private void resetState(long versionToStartFrom) { -// deltaIncrementalChangesState = -// Optional.of( -// DeltaIncrementalChangesState.builder() -// .deltaLog(deltaLog) -// .versionToStartFrom(versionToStartFrom) -// .build()); -// } + + // private void resetState(long versionToStartFrom) { + // deltaIncrementalChangesState = + // Optional.of( + // DeltaIncrementalChangesState.builder() + // .deltaLog(deltaLog) + // .versionToStartFrom(versionToStartFrom) + // .build()); + // } private List getInternalDataFiles( - io.delta.kernel.Snapshot snapshot, Table table, Engine engine, InternalSchema schema) { - try (DataFileIterator 
fileIterator = dataFileExtractor.iterator(snapshot, table, engine, schema)) { + io.delta.kernel.Snapshot snapshot, Table table, Engine engine, InternalSchema schema) { + try (DataFileIterator fileIterator = + dataFileExtractor.iterator(snapshot, table, engine, schema)) { List dataFiles = new ArrayList<>(); fileIterator.forEachRemaining(dataFiles::add); @@ -243,6 +238,6 @@ public void close() throws IOException {} private DeltaIncrementalChangesState getChangesState() { return deltaIncrementalChangesState.orElseThrow( - () -> new IllegalStateException("DeltaIncrementalChangesState is not initialized")); + () -> new IllegalStateException("DeltaIncrementalChangesState is not initialized")); } } diff --git a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java index e657dbbe3..13ac7a059 100644 --- a/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java +++ b/xtable-core/src/test/java/org/apache/xtable/delta/ITDeltaKernelConversionSource.java @@ -37,17 +37,20 @@ import org.apache.spark.serializer.KryoSerializer; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import org.apache.xtable.TestSparkDeltaTable; -import org.apache.xtable.ValidationTestHelper; -import org.apache.xtable.model.*; import org.junit.jupiter.api.*; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import io.delta.kernel.*; import org.apache.xtable.GenericTable; +import org.apache.xtable.TestSparkDeltaTable; +import org.apache.xtable.ValidationTestHelper; import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.kernel.DeltaKernelConversionSource; +import org.apache.xtable.model.*; import org.apache.xtable.model.schema.*; import org.apache.xtable.model.stat.ColumnStat; import org.apache.xtable.model.stat.PartitionValue; @@ -55,9 +58,6 @@ import org.apache.xtable.model.storage.*; import org.apache.xtable.model.storage.DataLayoutStrategy; import org.apache.xtable.model.storage.TableFormat; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.Arguments; -import org.junit.jupiter.params.provider.MethodSource; public class ITDeltaKernelConversionSource { private static final InternalField COL1_INT_FIELD = @@ -335,15 +335,14 @@ void getCurrentSnapshotPartitionedTest() throws URISyntaxException { snapshot.getPartitionedDataFiles().get(0)); } - @ParameterizedTest @MethodSource("testWithPartitionToggle") public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { String tableName = GenericTable.getTableName(); TestSparkDeltaTable testSparkDeltaTable = - new TestSparkDeltaTable( - tableName, tempDir, sparkSession, isPartitioned ? "yearOfBirth" : null, false); -// System.out.println("testSparkDeltaTable" + testSparkDeltaTable.getColumnsToSelect()); + new TestSparkDeltaTable( + tableName, tempDir, sparkSession, isPartitioned ? 
"yearOfBirth" : null, false); + // System.out.println("testSparkDeltaTable" + testSparkDeltaTable.getColumnsToSelect()); List> allActiveFiles = new ArrayList<>(); List allTableChanges = new ArrayList<>(); List rows = testSparkDeltaTable.insertRows(50); @@ -359,17 +358,16 @@ public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { testSparkDeltaTable.insertRows(50); allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); - testSparkDeltaTable.insertRows(50); allActiveFiles.add(testSparkDeltaTable.getAllActiveFiles()); SourceTable tableConfig = - SourceTable.builder() - .name(testSparkDeltaTable.getTableName()) - .basePath(testSparkDeltaTable.getBasePath()) - .formatName(TableFormat.DELTA) - .build(); + SourceTable.builder() + .name(testSparkDeltaTable.getTableName()) + .basePath(testSparkDeltaTable.getBasePath()) + .formatName(TableFormat.DELTA) + .build(); DeltaKernelConversionSource conversionSource = - conversionSourceProvider.getConversionSourceInstance(tableConfig); + conversionSourceProvider.getConversionSourceInstance(tableConfig); assertEquals(200L, testSparkDeltaTable.getNumRows()); InternalSnapshot internalSnapshot = conversionSource.getCurrentSnapshot(); @@ -377,29 +375,30 @@ public void testInsertsUpsertsAndDeletes(boolean isPartitioned) { validateDeltaPartitioning(internalSnapshot); } ValidationTestHelper.validateSnapshot( - internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); + internalSnapshot, allActiveFiles.get(allActiveFiles.size() - 1)); // Get changes in incremental format. InstantsForIncrementalSync instantsForIncrementalSync = - InstantsForIncrementalSync.builder() - .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) - .build(); -// CommitsBacklog commitsBacklog = -// conversionSource.getCommitsBacklog(instantsForIncrementalSync); -// for (Long version : commitsBacklog.getCommitsToProcess()) { -// TableChange tableChange = conversionSource.getTableChangeForCommit(version); -// allTableChanges.add(tableChange); -// } -// ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); + InstantsForIncrementalSync.builder() + .lastSyncInstant(Instant.ofEpochMilli(timestamp1)) + .build(); + // CommitsBacklog commitsBacklog = + // conversionSource.getCommitsBacklog(instantsForIncrementalSync); + // for (Long version : commitsBacklog.getCommitsToProcess()) { + // TableChange tableChange = conversionSource.getTableChangeForCommit(version); + // allTableChanges.add(tableChange); + // } + // ValidationTestHelper.validateTableChanges(allActiveFiles, allTableChanges); } private void validateDeltaPartitioning(InternalSnapshot internalSnapshot) { List partitionFields = - internalSnapshot.getTable().getPartitioningFields(); + internalSnapshot.getTable().getPartitioningFields(); assertEquals(1, partitionFields.size()); InternalPartitionField partitionField = partitionFields.get(0); assertEquals("birthDate", partitionField.getSourceField().getName()); assertEquals(PartitionTransformType.YEAR, partitionField.getTransformType()); } + private void validatePartitionDataFiles( PartitionFileGroup expectedPartitionFiles, PartitionFileGroup actualPartitionFiles) throws URISyntaxException { @@ -420,7 +419,7 @@ private void validateDataFiles( } private static Stream testWithPartitionToggle() { - return Stream.of( Arguments.of(false), Arguments.of(true)); + return Stream.of(Arguments.of(false), Arguments.of(true)); } private void validatePropertiesDataFile(InternalDataFile expected, InternalDataFile actual)