zinggAI · sania-16 · May 27, 2025 · Jun 2, 2025 · Jun 2, 2025 · Jun 18, 2025
diff --git a/.github/workflows/perfTest-febrl120K.yml b/.github/workflows/perfTest-febrl120K.yml
@@ -27,7 +27,7 @@ jobs:
       - name: setup spark
         uses: vemonet/setup-spark@v1
         with:
-          spark-version: '3.5.0'
+          spark-version: '3.5.5'
           hadoop-version: '3'
       - name: check spark
         run: spark-submit --version

diff --git a/.github/workflows/perfTest-ncVoters5M.yml b/.github/workflows/perfTest-ncVoters5M.yml
@@ -27,7 +27,7 @@ jobs:
       - name: setup spark
         uses: vemonet/setup-spark@v1
         with:
-          spark-version: '3.5.0'
+          spark-version: '3.5.5'
           hadoop-version: '3'
       - name: check spark
         run: spark-submit --version

diff --git a/common/client/src/main/java/zingg/common/client/util/DSUtil.java b/common/client/src/main/java/zingg/common/client/util/DSUtil.java
@@ -68,17 +68,6 @@ public ZFrame<D, R, C> join(ZFrame<D, R, C> lines, ZFrame<D, R, C> lines1, Strin
 		return pairs;
 	}
 
-	public ZFrame<D, R, C> joinZColFirst(ZFrame<D, R, C> lines, ZFrame<D, R, C> lines1, String joinColumn, boolean filter) {
-		ZFrame<D, R, C> pairs = lines.joinRight(lines1, joinColumn);
-		//in training, we only need that record matches only with lines bigger than itself
-		//in the case of normal as well as in the case of linking
-		if (LOG.isDebugEnabled()) {
-			LOG.debug("pairs length " + pairs.count());
-		}
-		if (filter) pairs = pairs.filter(pairs.gt(ColName.ID_COL));		
-		return pairs;
-	}
-
 	public ZFrame<D, R, C> addUniqueCol(ZFrame<D, R, C> dupesActual, String colName) {
 		String append = System.currentTimeMillis() + ":";
 		dupesActual = dupesActual.withColumn(colName + "temp", 

diff --git a/common/core/src/main/java/zingg/common/core/executor/Matcher.java b/common/core/src/main/java/zingg/common/core/executor/Matcher.java
@@ -198,7 +198,7 @@ public void execute() throws ZinggClientException {
 			//dupesActual.explain();
 			//dupesActual.toJavaRDD().saveAsTextFile("/tmp/zdupes");
 
-			writeOutput(testDataOriginal, dupesActual);		
+			writeOutput(getOutput(testDataOriginal, dupesActual));		
 
 		} catch (Exception e) {
 			if (LOG.isDebugEnabled()) e.printStackTrace();
@@ -218,13 +218,16 @@ public IMatchOutputBuilder<S,D,R,C> getMatchOutputBuilder(){
 		return this.matchOutputBuilder;
 	}
 
-
-	public void writeOutput( ZFrame<D,R,C>  testDataOriginal,  ZFrame<D,R,C>  dupesActual) throws ZinggClientException {
+	public ZFrame<D,R,C> getOutput(ZFrame<D,R,C>  testDataOriginal,  ZFrame<D,R,C>  dupesActual) throws ZinggClientException, Exception{
+		//hand both frames to the match output builder and return whatever frame it produces
+		return getMatchOutputBuilder().getOutput(testDataOriginal, dupesActual);
+	}
+
+	public void writeOutput( ZFrame<D,R,C>  graphWithScores) throws ZinggClientException {
 		try{
 		//input dupes are pairs
 		///pick ones according to the threshold by user
 		//all clusters consolidated in one place
-		ZFrame<D, R, C> graphWithScores = getMatchOutputBuilder().getOutput(testDataOriginal, dupesActual);
 		setOutput(graphWithScores);
 		if (args.getOutput() != null && toWrite) {
 				getPipeUtil().write(graphWithScores, args.getOutput());
@@ -235,7 +238,6 @@ public void writeOutput( ZFrame<D,R,C>  testDataOriginal,  ZFrame<D,R,C>  dupesA
 		}
 
 	}
-
 
     protected abstract StopWordsRemover<S,D,R,C,T> getStopWords();
 

diff --git a/common/core/src/main/java/zingg/common/core/match/output/GraphMatchOutputBuilder.java b/common/core/src/main/java/zingg/common/core/match/output/GraphMatchOutputBuilder.java
@@ -28,13 +28,11 @@ public GraphUtil<D, R, C> getGraphUtil() {
         return graphUtil;
     }
 
-
-
     public void setGraphUtil(GraphUtil<D, R, C> graphUtil) {
         this.graphUtil = graphUtil;
     }
 
-    public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException, Exception {
+	public ZFrame<D,R,C> getGraph(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException{
 		//-1 is initial suggestion, 1 is add, 0 is deletion, 2 is unsure
 		/*blocked = blocked.drop(ColName.HASH_COL);
 		blocked = blocked.drop(ColName.SOURCE_COL);
@@ -54,15 +52,24 @@ public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesA
 			graph.show();
 
 		}
+
+		return graph;
+	}
+
+	public ZFrame<D,R,C> getScores(ZFrame<D, R, C> graph, ZFrame<D, R, C> dupesActual) throws Exception{ //builds scores from dupesActual and graph, then returns getGraphWithScores(graph, score)
 		//write score
-		ZFrame<D,R,C>score = getMinMaxScores(dupesActual, graph).cache();
+		ZFrame<D,R,C> score = getMinMaxScores(dupesActual, graph).cache(); //score is read again below by show() (debug only) and by getGraphWithScores
 		//score.toJavaRDD().coalesce(1).saveAsTextFile("/tmp/zallscoresAvg");
 		graph = graph.repartition(args.getNumPartitions(), graph.col(ColName.ID_COL)).cache();
 		if (LOG.isDebugEnabled()) {
 			score.show();
 		}
 		ZFrame<D, R, C> graphWithScores = getGraphWithScores(graph, score);
 			//graphWithScores.toJavaRDD().saveAsTextFile("/tmp/zgraphWScores");
+		return graphWithScores;	
+	}
+
+	public ZFrame<D, R, C> dropColumns(ZFrame<D, R, C> graphWithScores){
 		graphWithScores = graphWithScores.drop(ColName.HASH_COL);
 		graphWithScores = graphWithScores.drop(ColName.COL_PREFIX + ColName.ID_COL);
 		graphWithScores = graphWithScores.drop(ColName.ID_COL);
@@ -79,9 +86,23 @@ public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesA
 		return graphWithScores;
 	}
 
+    public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException, Exception {
+		//three steps: build the graph, attach the scores, then remove the columns that dropColumns strips
+		ZFrame<D,R,C> builtGraph = getGraph(blocked, dupesActual);
+		ZFrame<D,R,C> scoredGraph = getScores(builtGraph, dupesActual);
+		return dropColumns(scoredGraph);
+	}
+
     protected ZFrame<D, R, C> getGraphWithScoresOrig(ZFrame<D, R, C> graph, ZFrame<D, R, C> score) {
-		ZFrame<D,R,C>graphWithScores = getDSUtil().joinZColFirst(
-			score, graph, ColName.ID_COL, false).cache();
+		//give graph's id column the prefixed name before joining it to the scores
+		ZFrame<D, R, C> prefixedGraph = graph.withColumnRenamed(ColName.ID_COL, ColName.COL_PREFIX + ColName.ID_COL);
+		ZFrame<D, R, C> joined = score.join(prefixedGraph, ColName.ID_COL, true, "right");
+		if (LOG.isDebugEnabled()) {
+			//the count is only computed when debug logging is on
+			LOG.debug("pairs length " + joined.count());
+		}
+		//remove the prefixed id column again and cache the result
+		ZFrame<D,R,C> graphWithScores = joined.drop(ColName.COL_PREFIX + ColName.ID_COL).cache();
 		return graphWithScores;
 	}
 

diff --git a/config/zingg.conf b/config/zingg.conf
@@ -16,6 +16,7 @@ spark.debug.maxToStringFields=200
 spark.sql.debug.maxToStringFields=200
 spark.driver.memory=8g
 spark.executor.memory=8g
+spark.sql.adaptive.enabled=false
 #spark.jars=/home/zingg/pathto.jar
 # Additional Jars could be passed to spark through below configuration. Jars list should be comma(,) separated. 
 #spark.jars=

diff --git a/pom.xml b/pom.xml
@@ -55,7 +55,7 @@
 				</property>
 			</activation>
 			<properties>
-				<spark.version>3.5.0</spark.version>
+				<spark.version>3.5.5</spark.version>
 				<scala.version>2.12.10</scala.version>
 				<spark.binary.version>3.5</spark.binary.version>
 				<scala.binary.version>2.12</scala.binary.version>

diff --git a/spark/core/src/test/java/zingg/spark/core/session/SparkSessionProvider.java b/spark/core/src/test/java/zingg/spark/core/session/SparkSessionProvider.java
@@ -10,6 +10,8 @@
 import zingg.common.client.IZingg;
 import zingg.spark.core.context.ZinggSparkContext;
 
+import java.util.Properties;
+
 public class SparkSessionProvider {
 
     private static SparkSessionProvider sparkSessionProvider;
@@ -23,17 +25,16 @@ public class SparkSessionProvider {
     private void initializeSession() {
         if (sparkSession == null) {
             try {
-                String sparkDriverMemory = System.getenv("SPARK_DRIVER_MEMORY");
-                if (sparkDriverMemory == null) {
-                    sparkDriverMemory = "1g";
-                }
-                sparkSession = SparkSession
+                SparkSession.Builder builder = SparkSession
                         .builder()
                         .master("local[*]")
-                        .appName("ZinggJunit")
-                        .config("spark.debug.maxToStringFields", 100)
-                        .config("spark.driver.memory", sparkDriverMemory)
-                        .getOrCreate();
+                        .appName("ZinggJunit");
+                Properties props = new Properties();
+                //try-with-resources closes the classpath stream once the properties have been read (Properties.load leaves it open)
+                try (java.io.InputStream in = getClass().getResourceAsStream("/zingg.properties")) { props.load(in); }
+                //copy every loaded property onto the session builder
+                for (String key : props.stringPropertyNames()) { builder = builder.config(key, props.getProperty(key)); }
+                sparkSession = builder.getOrCreate();
                 SparkContext sparkContext = sparkSession.sparkContext();
                 long driverMemory = sparkContext.getConf().getSizeAsGb("spark.driver.memory", "0");
                 System.out.println("Spark driver memory: " + driverMemory + " GB");
@@ -66,8 +67,6 @@ public static SparkSessionProvider getInstance() {
         return sparkSessionProvider;
     }
 
-
-
     //set getters
     public SparkSession getSparkSession() {
         return this.sparkSession;
@@ -84,4 +83,4 @@ public ZinggSparkContext getZinggSparkContext() {
     public IArguments getArgs() {
         return this.args;
     }
-}
+}
diff --git a/spark/core/src/test/resources/zingg.properties b/spark/core/src/test/resources/zingg.properties
@@ -0,0 +1,4 @@
+spark.executor.memory=8g
+spark.driver.memory=8g
+spark.sql.adaptive.enabled=false
+spark.debug.maxToStringFields=100