Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/perfTest-febrl120K.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
- name: setup spark
uses: vemonet/setup-spark@v1
with:
spark-version: '3.5.0'
spark-version: '3.5.5'
hadoop-version: '3'
- name: check spark
run: spark-submit --version
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/perfTest-ncVoters5M.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
- name: setup spark
uses: vemonet/setup-spark@v1
with:
spark-version: '3.5.0'
spark-version: '3.5.5'
hadoop-version: '3'
- name: check spark
run: spark-submit --version
Expand Down
11 changes: 0 additions & 11 deletions common/client/src/main/java/zingg/common/client/util/DSUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,6 @@ public ZFrame<D, R, C> join(ZFrame<D, R, C> lines, ZFrame<D, R, C> lines1, Strin
return pairs;
}

/**
 * Joins {@code lines} with {@code lines1} through {@code ZFrame.joinRight} on
 * {@code joinColumn} and optionally narrows the joined pairs.
 *
 * @param lines      frame on which {@code joinRight} is invoked
 * @param lines1     frame passed to {@code joinRight}
 * @param joinColumn name of the column handed to {@code joinRight}
 * @param filter     when {@code true}, the joined frame is filtered with the condition
 *                   built by {@code gt(ColName.ID_COL)}; when {@code false} it is returned as is
 * @return the joined (and, if requested, filtered) frame
 */
public ZFrame<D, R, C> joinZColFirst(ZFrame<D, R, C> lines, ZFrame<D, R, C> lines1, String joinColumn, boolean filter) {
    final ZFrame<D, R, C> joined = lines.joinRight(lines1, joinColumn);
    // count() is only evaluated when debug logging is switched on
    if (LOG.isDebugEnabled()) {
        LOG.debug("pairs length " + joined.count());
    }
    if (!filter) {
        return joined;
    }
    // in training, a record should only be matched with lines bigger than itself;
    // this holds for the normal flow as well as for linking
    return joined.filter(joined.gt(ColName.ID_COL));
}

public ZFrame<D, R, C> addUniqueCol(ZFrame<D, R, C> dupesActual, String colName) {
String append = System.currentTimeMillis() + ":";
dupesActual = dupesActual.withColumn(colName + "temp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ public void execute() throws ZinggClientException {
//dupesActual.explain();
//dupesActual.toJavaRDD().saveAsTextFile("/tmp/zdupes");

writeOutput(testDataOriginal, dupesActual);
writeOutput(getOutput(testDataOriginal, dupesActual));

} catch (Exception e) {
if (LOG.isDebugEnabled()) e.printStackTrace();
Expand All @@ -218,13 +218,16 @@ public IMatchOutputBuilder<S,D,R,C> getMatchOutputBuilder(){
return this.matchOutputBuilder;
}


public void writeOutput( ZFrame<D,R,C> testDataOriginal, ZFrame<D,R,C> dupesActual) throws ZinggClientException {
/**
 * Produces the output frame by delegating to the match output builder
 * returned by {@code getMatchOutputBuilder()}.
 *
 * @param testDataOriginal forwarded as the first argument of the builder's {@code getOutput}
 * @param dupesActual      forwarded as the second argument of the builder's {@code getOutput}
 * @return whatever frame the builder's {@code getOutput} returns
 * @throws ZinggClientException declared by the signature
 * @throws Exception            declared by the signature
 */
public ZFrame<D,R,C> getOutput(ZFrame<D,R,C> testDataOriginal, ZFrame<D,R,C> dupesActual) throws ZinggClientException, Exception{
    return getMatchOutputBuilder().getOutput(testDataOriginal, dupesActual);
}

public void writeOutput( ZFrame<D,R,C> graphWithScores) throws ZinggClientException {
try{
//input dupes are pairs
///pick ones according to the threshold by user
//all clusters consolidated in one place
ZFrame<D, R, C> graphWithScores = getMatchOutputBuilder().getOutput(testDataOriginal, dupesActual);
setOutput(graphWithScores);
if (args.getOutput() != null && toWrite) {
getPipeUtil().write(graphWithScores, args.getOutput());
Expand All @@ -235,7 +238,6 @@ public void writeOutput( ZFrame<D,R,C> testDataOriginal, ZFrame<D,R,C> dupesA
}

}


protected abstract StopWordsRemover<S,D,R,C,T> getStopWords();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,11 @@ public GraphUtil<D, R, C> getGraphUtil() {
return graphUtil;
}



/**
 * Stores the given graph utility in this object's {@code graphUtil} field.
 *
 * @param graphUtil the {@code GraphUtil} instance to keep
 */
public void setGraphUtil(GraphUtil<D, R, C> graphUtil) {
this.graphUtil = graphUtil;
}

public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException, Exception {
public ZFrame<D,R,C> getGraph(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException{
//-1 is initial suggestion, 1 is add, 0 is deletion, 2 is unsure
/*blocked = blocked.drop(ColName.HASH_COL);
blocked = blocked.drop(ColName.SOURCE_COL);
Expand All @@ -54,15 +52,24 @@ public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesA
graph.show();

}

return graph;
}

/**
 * Combines the graph with the scores derived from the duplicate pairs.
 *
 * <p>The scores come from {@code getMinMaxScores(dupesActual, graph)} and are cached. The graph is
 * then repartitioned into {@code args.getNumPartitions()} partitions on {@code ColName.ID_COL},
 * cached, and handed to {@code getGraphWithScores} together with the scores.
 *
 * @param graph       graph frame; used as given for scoring, then repartitioned for the final step
 * @param dupesActual frame passed to {@code getMinMaxScores} as its first argument
 * @return the frame returned by {@code getGraphWithScores(graph, score)}
 * @throws Exception declared by the signature; which call raises it is not visible in this block
 */
public ZFrame<D,R,C> getScores(ZFrame<D, R, C> graph, ZFrame<D, R, C> dupesActual) throws Exception{
    //write score
    ZFrame<D,R,C> score = getMinMaxScores(dupesActual, graph).cache();
    //score.toJavaRDD().coalesce(1).saveAsTextFile("/tmp/zallscoresAvg");
    // scoring above reads the graph before it is repartitioned here
    graph = graph.repartition(args.getNumPartitions(), graph.col(ColName.ID_COL)).cache();
    if (LOG.isDebugEnabled()) {
        score.show();
    }
    ZFrame<D, R, C> graphWithScores = getGraphWithScores(graph, score);
    //graphWithScores.toJavaRDD().saveAsTextFile("/tmp/zgraphWScores");
    return graphWithScores;
}

public ZFrame<D, R, C> dropColumns(ZFrame<D, R, C> graphWithScores){
graphWithScores = graphWithScores.drop(ColName.HASH_COL);
graphWithScores = graphWithScores.drop(ColName.COL_PREFIX + ColName.ID_COL);
graphWithScores = graphWithScores.drop(ColName.ID_COL);
Expand All @@ -79,9 +86,23 @@ public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesA
return graphWithScores;
}

/**
 * Runs the three output stages in order: {@code getGraph}, {@code getScores},
 * then {@code dropColumns}.
 *
 * @param blocked     forwarded to {@code getGraph} as its first argument
 * @param dupesActual forwarded to both {@code getGraph} and {@code getScores}
 * @return the result of {@code dropColumns} applied to the scored graph
 * @throws ZinggClientException declared by the signature
 * @throws Exception            declared by the signature
 */
public ZFrame<D, R, C> getOutput(ZFrame<D, R, C> blocked, ZFrame<D, R, C> dupesActual) throws ZinggClientException, Exception {
    final ZFrame<D, R, C> clustered = getGraph(blocked, dupesActual);
    final ZFrame<D, R, C> scored = getScores(clustered, dupesActual);
    return dropColumns(scored);
}

protected ZFrame<D, R, C> getGraphWithScoresOrig(ZFrame<D, R, C> graph, ZFrame<D, R, C> score) {
ZFrame<D,R,C>graphWithScores = getDSUtil().joinZColFirst(
score, graph, ColName.ID_COL, false).cache();
graph = graph.withColumnRenamed(ColName.ID_COL, ColName.COL_PREFIX + ColName.ID_COL);
ZFrame<D, R, C> pairs = score.join(graph, ColName.ID_COL, true, "right");
//in training, we only need that record matches only with lines bigger than itself
//in the case of normal as well as in the case of linking
if (LOG.isDebugEnabled()) {
LOG.debug("pairs length " + pairs.count());
}
//if (filter) pairs = pairs.filter(pairs.gt(ColName.ID_COL));
ZFrame<D,R,C> graphWithScores = pairs.drop(ColName.COL_PREFIX + ColName.ID_COL).cache();
return graphWithScores;
}

Expand Down
1 change: 1 addition & 0 deletions config/zingg.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ spark.debug.maxToStringFields=200
spark.sql.debug.maxToStringFields=200
spark.driver.memory=8g
spark.executor.memory=8g
spark.sql.adaptive.enabled=false
#spark.jars=/home/zingg/pathto.jar
# Additional Jars could be passed to spark through below configuration. Jars list should be comma(,) separated.
#spark.jars=
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
</property>
</activation>
<properties>
<spark.version>3.5.0</spark.version>
<spark.version>3.5.5</spark.version>
<scala.version>2.12.10</scala.version>
<spark.binary.version>3.5</spark.binary.version>
<scala.binary.version>2.12</scala.binary.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import zingg.common.client.IZingg;
import zingg.spark.core.context.ZinggSparkContext;

import java.util.Properties;

public class SparkSessionProvider {

private static SparkSessionProvider sparkSessionProvider;
Expand All @@ -23,17 +25,16 @@ public class SparkSessionProvider {
private void initializeSession() {
if (sparkSession == null) {
try {
String sparkDriverMemory = System.getenv("SPARK_DRIVER_MEMORY");
if (sparkDriverMemory == null) {
sparkDriverMemory = "1g";
}
sparkSession = SparkSession
SparkSession.Builder builder = SparkSession
.builder()
.master("local[*]")
.appName("ZinggJunit")
.config("spark.debug.maxToStringFields", 100)
.config("spark.driver.memory", sparkDriverMemory)
.getOrCreate();
.appName("ZinggJunit");
Properties props = new Properties();
props.load(getClass().getResourceAsStream("/zingg.properties"));
for (String key : props.stringPropertyNames()) {
builder = builder.config(key, props.getProperty(key));
}
sparkSession = builder.getOrCreate();
SparkContext sparkContext = sparkSession.sparkContext();
long driverMemory = sparkContext.getConf().getSizeAsGb("spark.driver.memory", "0");
System.out.println("Spark driver memory: " + driverMemory + " GB");
Expand Down Expand Up @@ -66,8 +67,6 @@ public static SparkSessionProvider getInstance() {
return sparkSessionProvider;
}



//set getters
public SparkSession getSparkSession() {
return this.sparkSession;
Expand All @@ -84,4 +83,4 @@ public ZinggSparkContext getZinggSparkContext() {
/**
 * Returns the value of this object's {@code args} field.
 *
 * @return the {@code IArguments} instance currently held
 */
public IArguments getArgs() {
return this.args;
}
}
}
4 changes: 4 additions & 0 deletions spark/core/src/test/resources/zingg.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
spark.executor.memory=8g
spark.driver.memory=8g
spark.sql.adaptive.enabled=false
spark.debug.maxToStringFields=100
Loading