added documentation

tdebatty · tdebatty · commit 560d86120307 · 2015-10-01T15:31:29.000+02:00
diff --git a/README.md b/README.md
@@ -64,14 +64,16 @@ The main characteristics of each implemented algorithm are presented below. The
 | Weighted Levenshtein 				|distance 				| No 			| No 		| 	      | O(m.n) |
 | Damerau-Levenshtein 				|distance 				| No 			| No 		| 	      | O(m.n) |
 | Jaro-Winkler 						|similarity<br>distance	| Yes  			| No 		| 	      | O(m.n) |
-| Longest Common Subsequence 		|distance 				| No 			| No 		| 	      | O(m.n) |
+| Longest Common Subsequence 		|distance 				| No 			| No 		| 	      | O(m.n)* |
 | Metric Longest Common Subsequence |distance   			| Yes 			| No  		| 	      | O(m.n) |
 | N-Gram (Kondrak)		 			|distance				| Yes  			| No 		| 	      | O(m.n) |
 | Q-Gram 							|distance  			 	| No  			| No 		| Profile | O(m+n) |
 | Cosine 							|similarity<br>distance | Yes  			| No  		| Profile | O(m+n) |
 | Jaccard 							|similarity<br>distance | Yes  			| Yes  		| Set	  | O(m+n) |
 | Sorensen-Dice 					|similarity<br>distance | Yes 			| No 		| Set	  | O(m+n) |
 
+\* In "Length of Maximal Common Subsequences", K.S. Larsen proposed an algorithm that computes the length of LCS in time O(log(m).log(n)). But the algorithm has a memory requirement O(m.n²) and was thus not implemented here.
+
 ## Levenshtein
 The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.
 
@@ -233,7 +235,9 @@ max = n + m
 
 LCS distance is equivalent to Levenshtein distance when only insertions and deletions are allowed (no substitution), or when the cost of a substitution is double the cost of an insertion or deletion.
 
-This class currently implements the dynamic programming approach, which has a space requirement O(m.n), and computation cost O (m.n)
+This class implements the dynamic programming approach, which has a space requirement O(m.n), and computation cost O(m.n).
+
+In "Length of Maximal Common Subsequences", K.S. Larsen proposed an algorithm that computes the length of LCS in time O(log(m).log(n)). But the algorithm has a memory requirement O(m.n²) and was thus not implemented here.
 
 ```java
 import info.debatty.java.stringsimilarity.*;
diff --git a/src/main/java/info/debatty/java/stringsimilarity/Cosine.java b/src/main/java/info/debatty/java/stringsimilarity/Cosine.java
@@ -28,6 +28,9 @@
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
+ * The similarity between the two strings is the cosine of the angle between 
+ * their two vector representations. It is computed as V1 . V2 / (|V1| * |V2|).
+ * The cosine distance is computed as 1 - cosine similarity.
  * @author Thibault Debatty
  */
 public class Cosine extends ShingleBased implements 
diff --git a/src/main/java/info/debatty/java/stringsimilarity/Jaccard.java b/src/main/java/info/debatty/java/stringsimilarity/Jaccard.java
@@ -29,7 +29,13 @@
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * 
+ * Each input string is converted into a set of n-grams, the Jaccard index is 
+ * then computed as |V1 inter V2| / |V1 union V2|.
+ * Like Q-Gram distance, the input strings are first converted into sets of 
+ * n-grams (sequences of n characters, also called k-shingles), but this time 
+ * the cardinality of each n-gram is not taken into account. 
+ * Distance is computed as 1 - Jaccard index.
+ * Jaccard distance is a metric distance.
  * @author Thibault Debatty
  */
 public class Jaccard extends ShingleBased implements 
diff --git a/src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java b/src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java
@@ -5,8 +5,15 @@
 import java.util.Arrays;
 
 /**
- *
- * @author tibo
+ * The Jaro–Winkler distance metric is designed and best suited for short 
+ * strings such as person names, and to detect typos; it is (roughly) a 
+ * variation of Damerau-Levenshtein, where the substitution of 2 close 
+ * characters is considered less important than the substitution of 2 characters
+ * that are far from each other.
+ * Jaro-Winkler was developed in the area of record linkage (duplicate 
+ * detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
+ * The distance is computed as 1 - Jaro-Winkler similarity.
+ * @author Thibault Debatty
  */
 public class JaroWinkler implements NormalizedStringSimilarity, NormalizedStringDistance {
     
diff --git a/src/main/java/info/debatty/java/stringsimilarity/NormalizedLevenshtein.java b/src/main/java/info/debatty/java/stringsimilarity/NormalizedLevenshtein.java
@@ -28,7 +28,10 @@
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- *
+ * This distance is computed as Levenshtein distance divided by the length of 
+ * the longest string. The resulting value is always in the interval [0.0, 1.0] 
+ * but it is not a metric anymore!
+ * The similarity is computed as 1 - normalized distance.
  * @author Thibault Debatty
  */
 public class NormalizedLevenshtein implements NormalizedStringDistance, NormalizedStringSimilarity {
diff --git a/src/main/java/info/debatty/java/stringsimilarity/QGram.java b/src/main/java/info/debatty/java/stringsimilarity/QGram.java
@@ -2,9 +2,14 @@
 
 
 import info.debatty.java.stringsimilarity.interfaces.StringDistance;
-import info.debatty.java.utils.SparseIntegerVector;
 
 /**
+ * Q-gram distance, as defined by Ukkonen in "Approximate string-matching with 
+ * q-grams and maximal matches". The distance between two strings is defined as 
+ * the L1 norm of the difference of their profiles (the number of occurrences of 
+ * each n-gram): SUM( |V1_i - V2_i| ). Q-gram distance is a lower bound on 
+ * Levenshtein distance, but can be computed in O(m + n), where Levenshtein 
+ * requires O(m.n).
  * @author Thibault Debatty
  */
 public class QGram extends ShingleBased implements StringDistance {
diff --git a/src/main/java/info/debatty/java/stringsimilarity/SorensenDice.java b/src/main/java/info/debatty/java/stringsimilarity/SorensenDice.java
@@ -28,7 +28,9 @@
 import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
 
 /**
- * 
+ * Similar to Jaccard index, but this time the similarity is computed as 
+ * 2 * |V1 inter V2| / (|V1| + |V2|).
+ * Distance is computed as 1 - Sorensen-Dice similarity.
  * @author Thibault Debatty
  */
 public class SorensenDice extends ShingleBased implements 
diff --git a/src/main/java/info/debatty/java/stringsimilarity/StringProfile.java b/src/main/java/info/debatty/java/stringsimilarity/StringProfile.java
@@ -27,7 +27,8 @@
 import info.debatty.java.utils.SparseIntegerVector;
 
 /**
- * Profile of a string, computed using shingling.
+ * Profile of a string (number of occurrences of each shingle/n-gram), computed 
+ * using shingling.
  * 
  * @author Thibault Debatty
  */
@@ -59,7 +60,7 @@ public StringProfile(SparseIntegerVector vector, KShingling ks) {
     /**
      *
      * @param other
-     * @return
+     * @return cosine similarity between this string and the other
      * @throws java.lang.Exception
      */
     public double cosineSimilarity(StringProfile other) throws Exception {
@@ -73,7 +74,7 @@ public double cosineSimilarity(StringProfile other) throws Exception {
     /**
      * 
      * @param other
-     * @return
+     * @return qgram distance between this string and the other
      * @throws Exception 
      */
     public double qgramDistance(StringProfile other) throws Exception {
@@ -113,13 +114,9 @@ public String[] getMostFrequentNGrams(int number) {
                         smallest_frequency = frequencies[j];
                     }
                 }
-                
             }
             
         }
-        
         return strings;
-
-        
     }
 }
diff --git a/src/main/java/info/debatty/java/stringsimilarity/StringSet.java b/src/main/java/info/debatty/java/stringsimilarity/StringSet.java
@@ -1,7 +1,7 @@
 /*
  * The MIT License
  *
- * Copyright 2015 tibo.
+ * Copyright 2015 Thibault Debatty.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,9 @@
 import info.debatty.java.utils.SparseBooleanVector;
 
 /**
- *
- * @author tibo
+ * Set representation of a string (list of occurring shingles/n-grams), without
+ * cardinality.
+ * @author Thibault Debatty
  */
 public class StringSet {
     private final SparseBooleanVector vector;

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,8 @@`
`27`	`27`	`import info.debatty.java.utils.SparseIntegerVector;`
`28`	`28`
`29`	`29`	`/**`
`30`		`- * Profile of a string, computed using shingling.`
	`30`	`+ * Profile of a string (number of occurrences of each shingle/n-gram), computed`
	`31`	`+ * using shingling.`
`31`	`32`	`*`
`32`	`33`	`* @author Thibault Debatty`
`33`	`34`	`*/`
`@@ -59,7 +60,7 @@ public StringProfile(SparseIntegerVector vector, KShingling ks) {`
`59`	`60`	`/**`
`60`	`61`	`*`
`61`	`62`	`* @param other`
`62`		`- * @return`
	`63`	`+ * @return cosine similarity between this string and the other`
`63`	`64`	`* @throws java.lang.Exception`
`64`	`65`	`*/`
`65`	`66`	`public double cosineSimilarity(StringProfile other) throws Exception {`
`@@ -73,7 +74,7 @@ public double cosineSimilarity(StringProfile other) throws Exception {`
`73`	`74`	`/**`
`74`	`75`	`*`
`75`	`76`	`* @param other`
`76`		`- * @return`
	`77`	`+ * @return qgram distance between this string and the other`
`77`	`78`	`* @throws Exception`
`78`	`79`	`*/`
`79`	`80`	`public double qgramDistance(StringProfile other) throws Exception {`
`@@ -113,13 +114,9 @@ public String[] getMostFrequentNGrams(int number) {`
`113`	`114`	`smallest_frequency = frequencies[j];`
`114`	`115`	`}`
`115`	`116`	`}`
`116`		`-`
`117`	`117`	`}`
`118`	`118`
`119`	`119`	`}`
`120`		`-`
`121`	`120`	`return strings;`
`122`		`-`
`123`		`-`
`124`	`121`	`}`
`125`	`122`	`}`