Skip to content

Commit 3bac27b

Browse files
authored
Merge pull request #10 from tigergraph/v1.2
V1.2
2 parents 892535f + e5a5ddf commit 3bac27b

19 files changed

+1229
-126
lines changed

README.algorithms

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
README for GSQL Algorithm Library
2-
6/26/19
2+
9/5/19
33

44
The GSQL Graph Algorithm Library is a collection of high-performance GSQL queries,
55
each of which implements a standard graph algorithm. Each algorithm is ready to be
@@ -62,11 +62,12 @@ https://docs.tigergraph.com/graph-algorithm-library
6262

6363
List of GSQL Graph Algorithms
6464
-----------------------------
65-
as of June 26, 2019
65+
as of Sept 5, 2019
6666
Compatible with TigerGraph version 2.1.8 or higher
6767

6868
closeness_cent Closeness Centrality
6969
conn_comp Connected Component Detection
70+
scc Strongly Connected Component Detection
7071
label_prop Label Propagation Method for Community Detection
7172
louvain_parallel Parallel Louvain Modularity Method with Refinement for Community Detection
7273
pageRank PageRank measurement of relative influence of each vertex
@@ -83,7 +84,9 @@ cosine_nbor_ss Cosine Similarity from a single vertex
8384
cosine_nbor_ap Cosine Similarity for each pair of vertices
8485
jaccard_nbor_ss Jaccard Similarity from a single vertex
8586
jaccard_nbor_ap Jaccard Similarity for each pair of vertices
86-
knn_cosine_ss k-Nearest Neighbor classification, using Cosine Similarity
87+
knn_cosine_ss k-Nearest Neighbor classification, using Cosine Similarity, single source
88+
knn_cosine_all k-Nearest Neighbor classification, using Cosine Similarity, batch
89+
knn_cosine_cv Cross validation for k-Nearest Neighbor, using Cosine Similarity
8790

8891
Each of the above may be available as 2 or 3 related queries
8992
For example:
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
CREATE QUERY knn_cosine_ss (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
/* Subquery: k-nearest-neighbor classification of one vertex using cosine similarity.

   Cosine similarity(A, B) = (A . B) / (||A|| * ||B||)
   where the vector of a vertex is made of the weights on its outgoing Likes edges.

   Parameters:
     source - the vertex to classify
     topK   - how many of the most similar labeled neighbors take part in the vote

   Returns (and prints) the known_label with the highest count among the topK
   neighbors. A label only replaces the current winner when its count is strictly
   greater. predicted_label is never assigned when no labeled neighbor is found.
*/
  SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;
  MapAccum<STRING, INT> @@count;
  INT max_count = 0;
  STRING predicted_label;

  # Step 1: copy the source's edge weight onto every vertex it likes and
  # accumulate the squared norm of the source vector.
  start = {source};
  subjects = SELECT t
             FROM start:s -((Likes):e)-> :t
             ACCUM t.@numerator = e.weight,
                   @@norm1 += pow(e.weight, 2);

  # Step 2: walk back to the other persons and build the dot product.
  # Only neighbors that carry a known label are kept.
  neighbours = SELECT t
               FROM subjects:s -(Reverse_Likes:e)-> Person:t
               WHERE t != source AND t.known_label != ""
               ACCUM t.@numerator += s.@numerator * e.weight;

  # Step 3: squared norm of each neighbor, then the similarity score.
  # The division is skipped when either norm is zero (all edge weights are 0),
  # so the score stays at the accumulator's initial value instead of becoming
  # an undefined number that would disturb the ORDER BY.
  kNN = SELECT s
        FROM neighbours:s -((Likes):e)-> :t
        ACCUM s.@norm2 += pow(e.weight, 2)
        POST-ACCUM
          IF @@norm1 * s.@norm2 > 0 THEN
            s.@similarity = s.@numerator / sqrt(@@norm1 * s.@norm2)
          END
        ORDER BY s.@similarity DESC
        LIMIT topK;

  # Step 4: count the labels of the selected neighbors.
  kNN = SELECT s
        FROM kNN:s
        ACCUM @@count += (s.known_label -> 1);

  # Step 5: majority vote.
  FOREACH (label, cnt) IN @@count DO
    IF cnt > max_count THEN
      max_count = cnt;
      predicted_label = label;
    END;
  END;

  PRINT predicted_label;
  RETURN predicted_label;

}
45+
46+
CREATE QUERY knn_cosine_all (INT topK) FOR GRAPH movie {
/* Batch k-nearest-neighbor classification based on cosine similarity.

   Every Person vertex whose known_label is empty is passed to the subquery
   knn_cosine_ss together with topK; the returned label is stored in the
   vertex accumulator @predicted_label. The resulting vertex set is printed
   under the name "source".
*/
  SumAccum<STRING> @predicted_label;

  all_persons = {Person.*};

  # keep only the unlabeled persons and classify each of them
  source = SELECT p
           FROM all_persons:p
           WHERE p.known_label == ""
           ACCUM p.@predicted_label = knn_cosine_ss(p, topK);

  PRINT source;

}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
CREATE QUERY knn_cosine_ss_attr (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
/* Subquery: k-nearest-neighbor classification of one vertex using cosine similarity.

   Cosine similarity(A, B) = (A . B) / (||A|| * ||B||)
   where the vector of a vertex is made of the weights on its outgoing Likes edges.

   Parameters:
     source - the vertex to classify
     topK   - how many of the most similar labeled neighbors take part in the vote

   Returns (and prints) the known_label with the highest count among the topK
   neighbors. A label only replaces the current winner when its count is strictly
   greater. predicted_label is never assigned when no labeled neighbor is found.
*/
  SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;
  MapAccum<STRING, INT> @@count;
  INT max_count = 0;
  STRING predicted_label;

  # Step 1: copy the source's edge weight onto every vertex it likes and
  # accumulate the squared norm of the source vector.
  start = {source};
  subjects = SELECT t
             FROM start:s -((Likes):e)-> :t
             ACCUM t.@numerator = e.weight,
                   @@norm1 += pow(e.weight, 2);

  # Step 2: walk back to the other persons and build the dot product.
  # Only neighbors that carry a known label are kept.
  neighbours = SELECT t
               FROM subjects:s -(Reverse_Likes:e)-> Person:t
               WHERE t != source AND t.known_label != ""
               ACCUM t.@numerator += s.@numerator * e.weight;

  # Step 3: squared norm of each neighbor, then the similarity score.
  # The division is skipped when either norm is zero (all edge weights are 0),
  # so the score stays at the accumulator's initial value instead of becoming
  # an undefined number that would disturb the ORDER BY.
  kNN = SELECT s
        FROM neighbours:s -((Likes):e)-> :t
        ACCUM s.@norm2 += pow(e.weight, 2)
        POST-ACCUM
          IF @@norm1 * s.@norm2 > 0 THEN
            s.@similarity = s.@numerator / sqrt(@@norm1 * s.@norm2)
          END
        ORDER BY s.@similarity DESC
        LIMIT topK;

  # Step 4: count the labels of the selected neighbors.
  kNN = SELECT s
        FROM kNN:s
        ACCUM @@count += (s.known_label -> 1);

  # Step 5: majority vote.
  FOREACH (label, cnt) IN @@count DO
    IF cnt > max_count THEN
      max_count = cnt;
      predicted_label = label;
    END;
  END;

  PRINT predicted_label;
  RETURN predicted_label;

}
45+
46+
CREATE QUERY knn_cosine_all_attr (INT topK) FOR GRAPH movie {
/* Batch k-nearest-neighbor classification based on cosine similarity,
   attribute-writing variant.

   Every Person vertex whose known_label is empty is classified by the
   subquery knn_cosine_ss_attr (defined in this file), and the returned label
   is written to the vertex attribute predicted_label. Nothing is printed.

   Parameters:
     topK - number of nearest labeled neighbors that vote for each vertex
*/
  source = {Person.*};

  # POST-ACCUM is used because the statement assigns a vertex attribute.
  source = SELECT s
           FROM source:s
           WHERE s.known_label == ""
           POST-ACCUM s.predicted_label = knn_cosine_ss_attr(s, topK);

}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
CREATE QUERY knn_cosine_ss_file (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
/* Subquery: k-nearest-neighbor classification of one vertex using cosine similarity.

   Cosine similarity(A, B) = (A . B) / (||A|| * ||B||)
   where the vector of a vertex is made of the weights on its outgoing Likes edges.

   Parameters:
     source - the vertex to classify
     topK   - how many of the most similar labeled neighbors take part in the vote

   Returns (and prints) the known_label with the highest count among the topK
   neighbors. A label only replaces the current winner when its count is strictly
   greater. predicted_label is never assigned when no labeled neighbor is found.
*/
  SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;
  MapAccum<STRING, INT> @@count;
  INT max_count = 0;
  STRING predicted_label;

  # Step 1: copy the source's edge weight onto every vertex it likes and
  # accumulate the squared norm of the source vector.
  start = {source};
  subjects = SELECT t
             FROM start:s -((Likes):e)-> :t
             ACCUM t.@numerator = e.weight,
                   @@norm1 += pow(e.weight, 2);

  # Step 2: walk back to the other persons and build the dot product.
  # Only neighbors that carry a known label are kept.
  neighbours = SELECT t
               FROM subjects:s -(Reverse_Likes:e)-> Person:t
               WHERE t != source AND t.known_label != ""
               ACCUM t.@numerator += s.@numerator * e.weight;

  # Step 3: squared norm of each neighbor, then the similarity score.
  # The division is skipped when either norm is zero (all edge weights are 0),
  # so the score stays at the accumulator's initial value instead of becoming
  # an undefined number that would disturb the ORDER BY.
  kNN = SELECT s
        FROM neighbours:s -((Likes):e)-> :t
        ACCUM s.@norm2 += pow(e.weight, 2)
        POST-ACCUM
          IF @@norm1 * s.@norm2 > 0 THEN
            s.@similarity = s.@numerator / sqrt(@@norm1 * s.@norm2)
          END
        ORDER BY s.@similarity DESC
        LIMIT topK;

  # Step 4: count the labels of the selected neighbors.
  kNN = SELECT s
        FROM kNN:s
        ACCUM @@count += (s.known_label -> 1);

  # Step 5: majority vote.
  FOREACH (label, cnt) IN @@count DO
    IF cnt > max_count THEN
      max_count = cnt;
      predicted_label = label;
    END;
  END;

  PRINT predicted_label;
  RETURN predicted_label;

}
45+
46+
CREATE QUERY knn_cosine_all_file (INT topK, FILE f) FOR GRAPH movie {
/* Batch k-nearest-neighbor classification based on cosine similarity,
   file-output variant.

   Every Person vertex whose known_label is empty is classified by the
   subquery knn_cosine_ss_file (defined in this file). One line per vertex,
   holding the vertex and its predicted label, is written to f.

   Parameters:
     topK - number of nearest labeled neighbors that vote for each vertex
     f    - output file receiving the (vertex, predicted label) lines
*/
  SumAccum<STRING> @predicted_label;

  source = {Person.*};

  # The prediction is stored in ACCUM and written out in POST-ACCUM.
  source = SELECT s
           FROM source:s
           WHERE s.known_label == ""
           ACCUM s.@predicted_label = knn_cosine_ss_file(s, topK)
           POST-ACCUM f.println(s, s.@predicted_label);

}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
CREATE QUERY knn_cosine_cv_sub (VERTEX source, INT max_k) FOR GRAPH movie RETURNS (ListAccum<STRING>) {
/* Subquery for cross validation: predicts the label of one vertex for every k
   from 1 up to max_k in a single pass.

   Parameters:
     source - the vertex to classify
     max_k  - largest neighborhood size to evaluate (also the heap capacity)

   Returns (and prints) a list whose element i is the majority label among the
   i+1 most similar labeled neighbors. The list is shorter than max_k when fewer
   labeled neighbors exist, and empty when there are none.
*/
  TYPEDEF TUPLE <label STRING, similarity FLOAT> Label_Score;
  # heap keeps at most max_k (label, similarity) tuples, highest similarity first
  HeapAccum<Label_Score>(max_k, similarity DESC) @@top_labels_heap;
  SumAccum<FLOAT> @numerator, @@norm1, @norm2;
  MapAccum<STRING, INT> @@count;               # label -> number of votes so far
  ListAccum<STRING> @@predicted_label_lists;   # list of predicted labels to return
  INT max_count = 0;
  STRING predicted_label;                      # winning label after each added neighbor

  # Step 1: copy the source's edge weight onto every vertex it likes and
  # accumulate the squared norm of the source vector.
  start = {source};
  subjects = SELECT t
             FROM start:s -((Likes):e)-> :t
             ACCUM t.@numerator = e.weight,
                   @@norm1 += pow(e.weight, 2);

  # Step 2: build the dot product on every other vertex that has a known label.
  neighbours = SELECT t
               FROM subjects:s -(Reverse_Likes:e)-> :t
               WHERE t != source AND t.known_label != ""
               ACCUM t.@numerator += s.@numerator * e.weight;

  # Step 3: squared norm of each neighbor, then push (label, similarity) into the heap.
  # When either norm is zero the similarity is recorded as 0 instead of dividing by zero,
  # which would put an undefined score into the heap ordering.
  kNN = SELECT s
        FROM neighbours:s -((Likes):e)-> :t
        ACCUM s.@norm2 += pow(e.weight, 2)
        POST-ACCUM
          IF @@norm1 * s.@norm2 > 0 THEN
            @@top_labels_heap += Label_Score(s.known_label, s.@numerator / sqrt(@@norm1 * s.@norm2))
          ELSE
            @@top_labels_heap += Label_Score(s.known_label, 0.0)
          END;

  # Step 4: walk the heap in similarity order; after each added neighbor the
  # current majority label is appended, giving one prediction per k.
  FOREACH item IN @@top_labels_heap DO
    @@count += (item.label -> 1);
    IF @@count.get(item.label) > max_count THEN
      max_count = @@count.get(item.label);
      predicted_label = item.label;
    END;
    @@predicted_label_lists += predicted_label;
  END;

  PRINT @@predicted_label_lists;
  RETURN @@predicted_label_lists;
}
45+
46+
47+
CREATE QUERY knn_cosine_cv (INT min_k, INT max_k) FOR GRAPH movie RETURNS (INT){
/* Leave-one-out cross validation for selecting the best k.

   Parameters:
     min_k, max_k - inclusive range of k to evaluate; a min_k below 1 is treated as 1

   Returns the k with the highest correct prediction rate (the smallest such k
   on ties, and 1 if no rate is above 0). Returns 0 without printing when the
   range is invalid (max_k < min_k or max_k < 1).
   Prints the list of correct rates, one per evaluated k, and the best k.

   Note: a vertex that has no neighbor with a known label counts as an
   incorrect prediction.
*/
  ListAccum<FLOAT> @@correct_rate_list;
  ListAccum<INT> @is_correct_list;          # element i is 1 if the prediction for k = i+1 is right
  ListAccum<STRING> @predicted_label_list;
  SumAccum<FLOAT> @@total_score;
  INT n, k, best_k=1;
  FLOAT max_rate=0, rate=0;

  IF max_k < min_k OR max_k < 1 THEN   # terminate if the range is invalid
    RETURN 0;
  END;

  start = {Person.*};

  # For every labeled vertex, get its predicted label for each k and mark
  # each prediction as correct (1) or incorrect (0).
  start = SELECT s
          FROM start:s
          WHERE s.known_label != ""
          ACCUM s.@predicted_label_list = knn_cosine_cv_sub(s, max_k)
          POST-ACCUM FOREACH label IN s.@predicted_label_list DO
                       IF s.known_label == label THEN
                         s.@is_correct_list += 1
                       ELSE
                         s.@is_correct_list += 0
                       END
                     END;

  n = start.size();

  # List indexes start from 0, so k neighbors correspond to index k-1.
  # Clamp the start so a min_k below 1 can never produce a negative index.
  IF min_k < 1 THEN
    k = 0;
  ELSE
    k = min_k - 1;
  END;

  WHILE k < max_k DO
    @@total_score = 0;
    start = SELECT s
            FROM start:s
            ACCUM IF s.@is_correct_list.size()==0 THEN
                    @@total_score += 0   # no labeled neighbor: counted as incorrect
                  ELSE IF k >= s.@is_correct_list.size() THEN
                    # fewer neighbors than k: use the prediction made with all of them
                    @@total_score += s.@is_correct_list.get(s.@is_correct_list.size()-1)
                  ELSE
                    @@total_score += s.@is_correct_list.get(k)
                  END;

    # Avoid dividing by zero when no vertex has a known label.
    IF n > 0 THEN
      rate = @@total_score / n;
    ELSE
      rate = 0;
    END;

    @@correct_rate_list += rate;
    IF rate > max_rate THEN
      max_rate = rate;   # remember the best rate seen so far
      best_k = k+1;
    END;
    k = k+1;
  END;

  PRINT @@correct_rate_list;
  PRINT best_k;
  RETURN best_k;
}

algorithms/examples/Classification/knn_cosine_ss.gsql

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,24 @@ Cosine similarity = A \dot B / ||A|| \dot ||B||
44
The output is the predicted label for the source vertex, which is the majority label of its k-nearest neighbors.
55
*/
66
SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;
7-
MapAccum<STRING, INT> @@mapCountLabels;
8-
INT maxCount = 0;
9-
STRING chosenLabel;
7+
MapAccum<STRING, INT> @@labels_count_map;
8+
INT max_count = 0;
9+
STRING predicted_label;
1010

1111
# calculate similarity and find the top k nearest neighbors
12-
Start = {source};
13-
Subjects = SELECT t
14-
FROM Start:s -(Likes:e)-> :t
12+
start = {source};
13+
subjects = SELECT t
14+
FROM start:s -((Likes):e)-> :t
1515
ACCUM t.@numerator = e.weight,
1616
@@norm1 += pow(e.weight, 2);
1717

18-
Neighbours = SELECT t
19-
FROM Subjects:s -(Reverse_Likes:e)-> Person:t
18+
neighbours = SELECT t
19+
FROM subjects:s -(Reverse_Likes:e)-> Person:t
2020
WHERE t != source AND t.known_label != "" # only consider the neighbours with known label
2121
ACCUM t.@numerator += s.@numerator * e.weight;
2222

2323
kNN = SELECT s
24-
FROM Neighbours:s -(Likes:e)-> :t
24+
FROM neighbours:s -((Likes):e)-> :t
2525
ACCUM s.@norm2 += pow(e.weight, 2)
2626
POST-ACCUM s.@similarity = s.@numerator/sqrt(@@norm1 * s.@norm2)
2727
ORDER BY s.@similarity DESC
@@ -30,15 +30,15 @@ The output is the predicted label for the source vertex, which is the majority l
3030
#predict label
3131
kNN = SELECT s
3232
FROM kNN:s
33-
ACCUM @@mapCountLabels += (s.known_label -> 1);
33+
ACCUM @@labels_count_map += (s.known_label -> 1);
3434

35-
FOREACH (label, cnt) IN @@mapCountLabels DO
36-
IF cnt > maxCount THEN
37-
maxCount = cnt;
38-
chosenLabel = label;
35+
FOREACH (label, cnt) IN @@labels_count_map DO
36+
IF cnt > max_count THEN
37+
max_count = cnt;
38+
predicted_label = label;
3939
END;
4040
END;
4141

42-
PRINT chosenLabel;
43-
RETURN chosenLabel;
42+
PRINT predicted_label;
43+
RETURN predicted_label;
4444
}

0 commit comments

Comments
 (0)