tigergraph
diff --git a/‎README.algorithms
Lines changed: 6 additions & 3 deletions b/‎README.algorithms
Lines changed: 6 additions & 3 deletions
diff --git a/‎algorithms/examples/Classification/knn_cosine_all.gsql
Lines changed: 59 additions & 0 deletions b/‎algorithms/examples/Classification/knn_cosine_all.gsql
Lines changed: 59 additions & 0 deletions
diff --git a/‎algorithms/examples/Classification/knn_cosine_all_attr.gsql
Lines changed: 58 additions & 0 deletions b/‎algorithms/examples/Classification/knn_cosine_all_attr.gsql
Lines changed: 58 additions & 0 deletions
diff --git a/‎algorithms/examples/Classification/knn_cosine_all_file.gsql
Lines changed: 59 additions & 0 deletions b/‎algorithms/examples/Classification/knn_cosine_all_file.gsql
Lines changed: 59 additions & 0 deletions
diff --git a/‎algorithms/examples/Classification/knn_cosine_cv.gsql
Lines changed: 100 additions & 0 deletions b/‎algorithms/examples/Classification/knn_cosine_cv.gsql
Lines changed: 100 additions & 0 deletions
diff --git a/‎algorithms/examples/Classification/knn_cosine_ss.gsql
Lines changed: 16 additions & 16 deletions b/‎algorithms/examples/Classification/knn_cosine_ss.gsql
Lines changed: 16 additions & 16 deletions
@@ -1,5 +1,5 @@
 README for GSQL Algorithm Library
-6/26/19
+9/5/19
 
 The GSQL Graph Algorithm Library is a collection of high-performance GSQL queries,
 each of which implements a standard graph algorithm. Each algorithm is ready to be
@@ -62,11 +62,12 @@ https://docs.tigergraph.com/graph-algorithm-library
 
 List of GSQL Graph Algorithms
 -----------------------------
-as of June 26, 2019
+as of Sept 5, 2019
 Compatible with TigerGraph version 2.1.8 or higher
 
 closeness_cent                  Closeness Centrality
 conn_comp                       Connected Component Detection
+scc				Strongly Connected Component Detection
 label_prop                      Label Propagation Method for Community Detection
 louvain_parallel                Parallel Louvain Modularity Method with Refinement for Community Detection
 pageRank                        PageRank measurement of relative influence of each vertex
@@ -83,7 +84,9 @@ cosine_nbor_ss                  Cosine Similarity from a single vertex
 cosine_nbor_ap                  Cosine Similarity for each pair of vertices
 jaccard_nbor_ss                 Jaccard Similarity from a single vertex
 jaccard_nbor_ap                 Jaccard Similarity for each pair of vertices
-knn_cosine_ss			k-Nearest Neighbor classification, using Cosine Similarity
+knn_cosine_ss			k-Nearest Neighbor classification, using Cosine Similarity, single source
+knn_cosine_all			k-Nearest Neighbor classification, using Cosine Similarity, batch
+knn_cosine_cv			Cross validation for k-Nearest Neighbor, using Cosine Similarity
 
 Each of the above  may be available as 2 or 3 related queries
 For example:
 
@@ -0,0 +1,59 @@
+CREATE QUERY knn_cosine_ss (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
+/* Subquery: k-nearest-neighbor classification of one source vertex. Prints and returns the majority known_label among the topK labeled Person vertices most similar to it.
+Cosine similarity = (A \dot B) / (||A|| * ||B||), computed from the edge weights of the two vertices.
+*/
+        SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;  # @@norm1: sum of squared Likes weights of the source; @norm2: the same sum per neighbor
+        MapAccum<STRING, INT> @@count;  # known_label -> number of selected neighbors carrying it
+        INT max_count = 0;
+        STRING predicted_label;
+
+        # calculate similarity and find the top k nearest neighbors
+        start = {source};
+        subjects = SELECT t    # vertices the source Likes; each one stores the weight of the source's edge
+                   FROM start:s -((Likes):e)-> :t
+                   ACCUM t.@numerator = e.weight,
+                         @@norm1 += pow(e.weight, 2);
+
+        neighbours = SELECT t    # Person vertices reached back from the subjects over Reverse_Likes
+                     FROM subjects:s -(Reverse_Likes:e)-> Person:t
+                     WHERE t != source AND t.known_label != ""    # only consider the ones with known label
+                     ACCUM t.@numerator += s.@numerator * e.weight;    # running dot product; NOTE(review): presumably Reverse_Likes weights mirror Likes weights - confirm
+
+        kNN = SELECT s
+              FROM neighbours:s -((Likes):e)-> :t
+              ACCUM s.@norm2 += pow(e.weight, 2)
+              POST-ACCUM s.@similarity = s.@numerator/sqrt(@@norm1 * s.@norm2)
+              ORDER BY s.@similarity DESC
+              LIMIT topK;    # keep only the topK most similar neighbors
+
+        # predict label: count the labels of the selected neighbors
+        kNN = SELECT s
+              FROM kNN:s
+              ACCUM @@count += (s.known_label -> 1);
+
+        FOREACH (label, cnt) IN @@count DO    # majority vote; the comparison is strict, so on a tie the label visited first is kept
+            IF cnt > max_count THEN
+                max_count = cnt;
+                predicted_label = label;
+            END;
+        END;
+
+        PRINT predicted_label;
+        RETURN predicted_label;
+
+}
+
+CREATE QUERY knn_cosine_all (INT topK) FOR GRAPH movie {
+/* Batch k-nearest-neighbor classification based on Cosine Similarity: every Person vertex whose known_label is empty
+   gets the label returned by the knn_cosine_ss subquery (majority label of its topK nearest labeled neighbors). The selected vertices are printed.
+*/
+        SumAccum<STRING> @predicted_label;    # prediction stored on each unlabeled vertex
+        
+        source = {Person.*};        
+        source = SELECT s
+                 FROM source:s 
+                 WHERE s.known_label == ""    # only vertices without a label are classified
+                 ACCUM s.@predicted_label = knn_cosine_ss(s, topK);    
+        PRINT source;
+  
+}
@@ -0,0 +1,58 @@
+CREATE QUERY knn_cosine_ss_attr (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
+/* Subquery: k-nearest-neighbor classification of one source vertex. Prints and returns the majority known_label among the topK labeled Person vertices most similar to it.
+Cosine similarity = (A \dot B) / (||A|| * ||B||), computed from the edge weights of the two vertices.
+*/
+        SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;  # @@norm1: sum of squared Likes weights of the source; @norm2: the same sum per neighbor
+        MapAccum<STRING, INT> @@count;  # known_label -> number of selected neighbors carrying it
+        INT max_count = 0;
+        STRING predicted_label;
+
+        # calculate similarity and find the top k nearest neighbors
+        start = {source};
+        subjects = SELECT t    # vertices the source Likes; each one stores the weight of the source's edge
+                   FROM start:s -((Likes):e)-> :t
+                   ACCUM t.@numerator = e.weight,
+                         @@norm1 += pow(e.weight, 2);
+
+        neighbours = SELECT t    # Person vertices reached back from the subjects over Reverse_Likes
+                     FROM subjects:s -(Reverse_Likes:e)-> Person:t
+                     WHERE t != source AND t.known_label != ""    # only consider the ones with known label
+                     ACCUM t.@numerator += s.@numerator * e.weight;    # running dot product; NOTE(review): presumably Reverse_Likes weights mirror Likes weights - confirm
+
+        kNN = SELECT s
+              FROM neighbours:s -((Likes):e)-> :t
+              ACCUM s.@norm2 += pow(e.weight, 2)
+              POST-ACCUM s.@similarity = s.@numerator/sqrt(@@norm1 * s.@norm2)
+              ORDER BY s.@similarity DESC
+              LIMIT topK;    # keep only the topK most similar neighbors
+
+        # predict label: count the labels of the selected neighbors
+        kNN = SELECT s
+              FROM kNN:s
+              ACCUM @@count += (s.known_label -> 1);
+
+        FOREACH (label, cnt) IN @@count DO    # majority vote; the comparison is strict, so on a tie the label visited first is kept
+            IF cnt > max_count THEN
+                max_count = cnt;
+                predicted_label = label;
+            END;
+        END;
+
+        PRINT predicted_label;
+        RETURN predicted_label;
+
+}
+
+CREATE QUERY knn_cosine_all_attr (INT topK) FOR GRAPH movie {
+/* Batch k-nearest-neighbor classification based on Cosine Similarity, stored as a vertex attribute.
+   For every Person vertex whose known_label is empty, the majority label of its topK nearest labeled
+   neighbors (as returned by the knn_cosine_ss_attr subquery) is written to the attribute predicted_label.
+*/
+        unlabeled = {Person.*};
+
+        # The prediction goes straight into the predicted_label attribute, so no accumulator is declared.
+        unlabeled = SELECT s
+                    FROM unlabeled:s
+                    WHERE s.known_label == ""    # only vertices without a label are classified
+                    POST-ACCUM s.predicted_label = knn_cosine_ss_attr(s, topK);
+}
@@ -0,0 +1,59 @@
+CREATE QUERY knn_cosine_ss_file (VERTEX source, INT topK) FOR GRAPH movie RETURNS (STRING) {
+/* Subquery: k-nearest-neighbor classification of one source vertex. Prints and returns the majority known_label among the topK labeled Person vertices most similar to it.
+Cosine similarity = (A \dot B) / (||A|| * ||B||), computed from the edge weights of the two vertices.
+*/
+        SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;  # @@norm1: sum of squared Likes weights of the source; @norm2: the same sum per neighbor
+        MapAccum<STRING, INT> @@count;  # known_label -> number of selected neighbors carrying it
+        INT max_count = 0;
+        STRING predicted_label;
+
+        # calculate similarity and find the top k nearest neighbors
+        start = {source};
+        subjects = SELECT t    # vertices the source Likes; each one stores the weight of the source's edge
+                   FROM start:s -((Likes):e)-> :t
+                   ACCUM t.@numerator = e.weight,
+                         @@norm1 += pow(e.weight, 2);
+
+        neighbours = SELECT t    # Person vertices reached back from the subjects over Reverse_Likes
+                     FROM subjects:s -(Reverse_Likes:e)-> Person:t
+                     WHERE t != source AND t.known_label != ""    # only consider the ones with known label
+                     ACCUM t.@numerator += s.@numerator * e.weight;    # running dot product; NOTE(review): presumably Reverse_Likes weights mirror Likes weights - confirm
+
+        kNN = SELECT s
+              FROM neighbours:s -((Likes):e)-> :t
+              ACCUM s.@norm2 += pow(e.weight, 2)
+              POST-ACCUM s.@similarity = s.@numerator/sqrt(@@norm1 * s.@norm2)
+              ORDER BY s.@similarity DESC
+              LIMIT topK;    # keep only the topK most similar neighbors
+
+        # predict label: count the labels of the selected neighbors
+        kNN = SELECT s
+              FROM kNN:s
+              ACCUM @@count += (s.known_label -> 1);
+
+        FOREACH (label, cnt) IN @@count DO    # majority vote; the comparison is strict, so on a tie the label visited first is kept
+            IF cnt > max_count THEN
+                max_count = cnt;
+                predicted_label = label;
+            END;
+        END;
+
+        PRINT predicted_label;
+        RETURN predicted_label;
+
+}
+
+CREATE QUERY knn_cosine_all_file (INT topK, FILE f) FOR GRAPH movie {
+/* Batch k-nearest-neighbor classification based on Cosine Similarity, written to a file.
+   For every Person vertex whose known_label is empty, the majority label of its topK nearest labeled neighbors
+   (as returned by the knn_cosine_ss_file subquery) is computed and one line "vertex, label" is written to f.
+*/
+        SumAccum<STRING> @predicted_label;    # prediction stored on each unlabeled vertex
+
+        source = {Person.*};
+        source = SELECT s
+                 FROM source:s
+                 WHERE s.known_label == ""    # only vertices without a label are classified
+                 ACCUM s.@predicted_label = knn_cosine_ss_file(s, topK)
+                 POST-ACCUM f.println(s, s.@predicted_label);
+}
@@ -0,0 +1,100 @@
+CREATE QUERY knn_cosine_cv_sub (VERTEX source, INT max_k) FOR GRAPH movie RETURNS (ListAccum<STRING>) {
+/* Subquery for cross validation: returns the list of labels predicted for the source vertex for k = 1, 2, ...
+   Entry i (0-based) is the majority known_label among the i+1 most similar labeled neighbors (Cosine Similarity).
+   The list has at most max_k entries and is shorter (possibly empty) when the source has fewer labeled neighbors.
+*/
+        TYPEDEF TUPLE <label STRING, similarity FLOAT> Label_Score;
+        HeapAccum<Label_Score>(max_k, similarity DESC) @@top_labels_heap;  # keeps the max_k best (label, similarity) tuples, ordered by similarity score
+        SumAccum<FLOAT> @numerator, @@norm1, @norm2;
+        MapAccum<STRING, INT> @@count;              # label -> occurrences among the neighbors visited so far
+        ListAccum<STRING> @@predicted_label_lists;  # list of predicted labels to return
+        INT max_count = 0;
+        STRING predicted_label;   # predicted label in each iteration
+
+        # calculate similarity and find the top k nearest neighbors
+        start = {source};
+        subjects = SELECT t
+                   FROM start:s -((Likes):e)-> :t
+                   ACCUM t.@numerator = e.weight,
+                         @@norm1 += pow(e.weight, 2);
+
+        neighbours = SELECT t
+                     FROM subjects:s -(Reverse_Likes:e)-> :t
+                     WHERE t != source AND t.known_label != ""    # only consider the neighbors with known label
+                     ACCUM t.@numerator += s.@numerator * e.weight;
+
+        kNN = SELECT s
+              FROM neighbours:s -((Likes):e)-> :t
+              ACCUM s.@norm2 += pow(e.weight, 2)
+              POST-ACCUM @@top_labels_heap += Label_Score(s.known_label, s.@numerator/sqrt(@@norm1 * s.@norm2)); # store the label and similarity score in a heap
+
+        # Walk the heap from most to least similar while keeping a running majority vote:
+        # once the i-th neighbor has been counted, predicted_label is the prediction for k = i.
+        FOREACH item IN @@top_labels_heap DO
+                @@count += (item.label -> 1);   # count is a map, key is the label, value is the count of the label
+                IF @@count.get(item.label) > max_count THEN
+                         max_count = @@count.get(item.label);
+                         predicted_label = item.label;
+                END;
+                @@predicted_label_lists += predicted_label;  # one entry per k
+        END;
+
+        PRINT @@predicted_label_lists;
+        RETURN @@predicted_label_lists;
+}
+
+
+CREATE QUERY knn_cosine_cv (INT min_k, INT max_k) FOR GRAPH movie RETURNS (INT){
+/* Leave-one-out cross validation for selecting optimal k.
+   The input is an inclusive range of k (1 <= min_k <= max_k); the output is the k in that range with the highest correct prediction rate (smallest such k on ties).
+   Returns 0 when the range is invalid or when no Person vertex has a known label. A vertex with no labeled neighbor counts as a false prediction.
+*/
+        ListAccum<FLOAT> @@correct_rate_list;    # correct prediction rate for k = min_k .. max_k
+        ListAccum<INT> @is_correct_list;         # per vertex: 1 if the prediction for k = index+1 is correct, else 0
+        ListAccum<STRING> @predicted_label_list;
+        SumAccum<FLOAT> @@total_score;
+        INT n, k, best_k=1;
+        FLOAT correct_rate, max_rate=0;
+        IF min_k < 1 OR max_k < min_k THEN  # terminate if the range is invalid; k is 1-based, so min_k < 1 would index before the start of the lists
+                RETURN 0;
+        END;
+        best_k = min_k;  # fallback that stays inside the requested range when every rate is 0
+        start = {Person.*};
+        start = SELECT s
+                FROM start:s
+                WHERE s.known_label != ""  # get the vertices with known label
+                ACCUM s.@predicted_label_list = knn_cosine_cv_sub(s, max_k)  # get a list of predicted label wrt different k
+                POST-ACCUM FOREACH label IN s.@predicted_label_list DO
+                                   IF s.known_label == label THEN  # an empty list (no labeled neighbor) leaves @is_correct_list empty
+                                           s.@is_correct_list += 1
+                                   ELSE
+                                           s.@is_correct_list += 0
+                                   END
+                           END;
+        n = start.size();
+        IF n == 0 THEN RETURN 0; END;  # no labeled vertex to validate on; also avoids dividing by zero below
+        k = min_k-1;  # index starts from 0
+        WHILE k < max_k DO
+                @@total_score = 0;
+                start = SELECT s
+                        FROM start:s
+                        ACCUM IF s.@is_correct_list.size()==0 THEN
+                                      @@total_score += 0  # if there is no neighbor, it is considered incorrect prediction
+                              ELSE IF k >= s.@is_correct_list.size() THEN
+                                      @@total_score += s.@is_correct_list.get(s.@is_correct_list.size()-1)   # use all neighbors it has when it is not enough
+                              ELSE
+                                      @@total_score += s.@is_correct_list.get(k)
+                              END;
+                correct_rate = @@total_score / n;
+                @@correct_rate_list += correct_rate;
+                IF correct_rate > max_rate THEN
+                        max_rate = correct_rate;  # store the max correct rate in max_rate
+                        best_k = k+1;
+                END;
+                k = k+1;
+        END;
+
+        PRINT @@correct_rate_list;
+        PRINT best_k;
+        RETURN best_k;
+}
@@ -4,24 +4,24 @@ Cosine similarity = A \dot B / ||A|| \dot ||B||
 The output is the predicted label for the source vertex, which is the majority label of its k-nearest neighbors. 
 */
         SumAccum<FLOAT> @numerator, @@norm1, @norm2, @similarity;
-        MapAccum<STRING, INT> @@mapCountLabels;
-        INT maxCount = 0;
-        STRING chosenLabel;
+        MapAccum<STRING, INT> @@labels_count_map;
+        INT max_count = 0;
+        STRING predicted_label;
 
         # calculate similarity and find the top k nearest neighbors
-        Start = {source};
-        Subjects = SELECT t
-                   FROM Start:s -(Likes:e)-> :t
+        start = {source};
+        subjects = SELECT t
+                   FROM start:s -((Likes):e)-> :t
                    ACCUM t.@numerator = e.weight,
@@norm1 += pow(e.weight, 2);
 
-        Neighbours = SELECT t
-                     FROM Subjects:s -(Reverse_Likes:e)-> Person:t
+        neighbours = SELECT t
+                     FROM subjects:s -(Reverse_Likes:e)-> Person:t
                      WHERE t != source AND t.known_label != ""    # only consider the neighbours with known label
                      ACCUM t.@numerator += s.@numerator * e.weight;
 
         kNN = SELECT s
-              FROM Neighbours:s -(Likes:e)-> :t
+              FROM neighbours:s -((Likes):e)-> :t
               ACCUM s.@norm2 += pow(e.weight, 2)
               POST-ACCUM s.@similarity = s.@numerator/sqrt(@@norm1 * s.@norm2)
               ORDER BY s.@similarity DESC
@@ -30,15 +30,15 @@ The output is the predicted label for the source vertex, which is the majority l
         #predict label
         kNN = SELECT s
               FROM kNN:s
-              ACCUM @@mapCountLabels += (s.known_label -> 1);
+              ACCUM @@labels_count_map += (s.known_label -> 1);
 
-        FOREACH (label, cnt) IN @@mapCountLabels DO
-            IF cnt > maxCount THEN
-                maxCount = cnt;
-                chosenLabel = label;
+        FOREACH (label, cnt) IN @@labels_count_map DO
+            IF cnt > max_count THEN
+                max_count = cnt;
+                predicted_label = label;
             END;
         END;
 
-        PRINT chosenLabel;
-        RETURN chosenLabel;
+        PRINT predicted_label;
+        RETURN predicted_label;
 }