|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.apache.comet.fuzz |
| 21 | + |
| 22 | +import java.io.File |
| 23 | + |
| 24 | +import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand} |
| 25 | + |
| 26 | +import org.apache.spark.sql.{functions, SparkSession} |
| 27 | + |
/**
 * Command-line configuration for the comparison tool, parsed with Scallop.
 *
 * Currently exposes a single subcommand, `compareParquet`, which takes the two
 * folders of Parquet results (one produced by Spark, one by Comet) to compare.
 */
class ComparisonToolConf(arguments: Seq[String]) extends ScallopConf(arguments) {
  object compareParquet extends Subcommand("compareParquet") {
    val inputSparkFolder: ScallopOption[String] =
      opt[String](required = true, descr = "Folder with Spark produced results in Parquet format")
    val inputCometFolder: ScallopOption[String] =
      opt[String](required = true, descr = "Folder with Comet produced results in Parquet format")
  }
  addSubcommand(compareParquet)
  // Scallop requires verify() to run after all options/subcommands are declared;
  // it parses `arguments` and fails fast on missing or unknown options.
  verify()
}
| 38 | + |
object ComparisonTool {

  // Created lazily so that argument parsing / help output does not require a
  // running Spark environment.
  lazy val spark: SparkSession = SparkSession
    .builder()
    .getOrCreate()

  /** Entry point: dispatches on the parsed subcommand. */
  def main(args: Array[String]): Unit = {
    val conf = new ComparisonToolConf(args.toIndexedSeq)
    conf.subcommand match {
      case Some(conf.compareParquet) =>
        compareParquetFolders(
          spark,
          conf.compareParquet.inputSparkFolder(),
          conf.compareParquet.inputCometFolder())

      case _ =>
        // scalastyle:off println
        println("Invalid subcommand")
        // scalastyle:on println
        sys.exit(-1)
    }
  }

  /**
   * Compares two folders of Parquet results, subfolder by subfolder, writing a
   * markdown report of matches, mismatches, and errors.
   *
   * Each immediate subdirectory of `sparkFolderPath` is expected to have a
   * same-named counterpart under `cometFolderPath`; a missing counterpart is
   * reported as a warning rather than aborting the run, and a failure while
   * comparing one subfolder does not stop the remaining comparisons.
   *
   * @param spark the session used to read both result sets
   * @param sparkFolderPath folder containing Spark-produced results
   * @param cometFolderPath folder containing Comet-produced results
   * @throws IllegalArgumentException if either path does not exist or is not a directory
   */
  private def compareParquetFolders(
      spark: SparkSession,
      sparkFolderPath: String,
      cometFolderPath: String): Unit = {

    val output = QueryRunner.createOutputMdFile()

    try {
      val sparkFolder = new File(sparkFolderPath)
      val cometFolder = new File(cometFolderPath)

      if (!sparkFolder.exists() || !sparkFolder.isDirectory) {
        throw new IllegalArgumentException(
          s"Spark folder does not exist or is not a directory: $sparkFolderPath")
      }

      if (!cometFolder.exists() || !cometFolder.isDirectory) {
        throw new IllegalArgumentException(
          s"Comet folder does not exist or is not a directory: $cometFolderPath")
      }

      // File.listFiles() can return null on an I/O error even after the checks
      // above, so guard with Option to avoid an NPE.
      val sparkSubfolders = Option(sparkFolder.listFiles())
        .getOrElse(Array.empty[File])
        .filter(_.isDirectory)
        .map(_.getName)
        .sorted

      output.write("# Comparing Parquet Folders\n\n")
      output.write(s"Spark folder: $sparkFolderPath\n")
      output.write(s"Comet folder: $cometFolderPath\n")
      output.write(s"Found ${sparkSubfolders.length} subfolders to compare\n\n")

      // Disable Comet once up front so both result sets are read back with
      // Spark's built-in Parquet reader. The session conf persists across reads,
      // so setting it per-subfolder (as before) was redundant. NOTE(review): the
      // setting is intentionally never re-enabled, matching prior behavior.
      spark.conf.set("spark.comet.enabled", "false")

      // Compare each subfolder
      sparkSubfolders.foreach { subfolderName =>
        val sparkSubfolderPath = new File(sparkFolder, subfolderName)
        val cometSubfolderPath = new File(cometFolder, subfolderName)

        if (!cometSubfolderPath.exists() || !cometSubfolderPath.isDirectory) {
          output.write(s"## Subfolder: $subfolderName\n")
          output.write(
            s"[WARNING] Comet subfolder not found: ${cometSubfolderPath.getAbsolutePath}\n\n")
        } else {
          output.write(s"## Comparing subfolder: $subfolderName\n\n")

          try {
            // Sort by every column so row order is deterministic before comparing.
            val sparkDf = spark.read.parquet(sparkSubfolderPath.getAbsolutePath)
            val sparkRows = sparkDf.orderBy(sparkDf.columns.map(functions.col): _*).collect()

            val cometDf = spark.read.parquet(cometSubfolderPath.getAbsolutePath)
            val cometRows = cometDf.orderBy(cometDf.columns.map(functions.col): _*).collect()

            // Compare the results
            if (QueryComparison.assertSameRows(sparkRows, cometRows, output)) {
              output.write(s"Subfolder $subfolderName: ${sparkRows.length} rows matched\n\n")
            }
          } catch {
            // NonFatal (not bare Exception) so OOM/interrupts still propagate.
            case NonFatal(e) =>
              output.write(
                s"[ERROR] Failed to compare subfolder $subfolderName: ${e.getMessage}\n")
              val sw = new StringWriter()
              val pw = new PrintWriter(sw)
              try e.printStackTrace(pw)
              finally pw.close()
              output.write(s"```\n${sw.toString}\n```\n\n")
          }
        }

        output.flush()
      }

      output.write("\n# Comparison Complete\n")
      output.write(s"Compared ${sparkSubfolders.length} subfolders\n")

    } finally {
      output.close()
    }
  }
}
0 commit comments