1
1
package info .debatty .java .stringsimilarity ;
2
2
3
3
import info .debatty .java .stringsimilarity .interfaces .NormalizedStringDistance ;
4
+ import net .jcip .annotations .Immutable ;
4
5
5
6
/**
6
7
* N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance",
13
14
*
14
15
* http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
15
16
*/
17
+ @ Immutable
16
18
public class NGram implements NormalizedStringDistance {
17
19
20
+ private static final int DEFAULT_N = 2 ;
18
21
private final int n ;
19
22
20
- public NGram (int n ) {
23
+ /**
24
+ * Instantiate with given value for n-gram length.
25
+ * @param n length of the n-grams used by this instance
26
+ */
27
+ public NGram (final int n ) {
21
28
this .n = n ;
22
29
}
23
30
31
+ /**
32
+ * Instantiate with default value for n-gram length (2).
33
+ */
24
34
public NGram () {
25
- this .n = 2 ;
35
+ this .n = DEFAULT_N ;
26
36
}
27
37
28
- @ Override
29
- public double distance (String s0 , String s1 ) {
38
+ /**
39
+ * Compute n-gram distance.
40
+ * @param s0 the first string to compare
41
+ * @param s1 the second string to compare
42
+ * @return the computed n-gram distance between s0 and s1
43
+ */
44
+ public final double distance (final String s0 , final String s1 ) {
30
45
final char special = '\n' ;
31
46
final int sl = s0 .length ();
32
47
final int tl = s1 .length ();
@@ -50,9 +65,9 @@ public double distance(String s0, String s1) {
50
65
}
51
66
52
67
char [] sa = new char [sl + n - 1 ];
53
- float p [] ; //'previous' cost array, horizontally
54
- float d [] ; // cost array, horizontally
55
- float _d [] ; //placeholder to assist in swapping p and d
68
+ float [] p ; //'previous' cost array, horizontally
69
+ float [] d ; // cost array, horizontally
70
+ float [] d2 ; //placeholder to assist in swapping p and d
56
71
57
72
//construct sa with prefix
58
73
for (int i = 0 ; i < sa .length ; i ++) {
@@ -76,7 +91,7 @@ public double distance(String s0, String s1) {
76
91
}
77
92
78
93
for (j = 1 ; j <= tl ; j ++) {
79
- //construct t_j n-gram
94
+ //construct t_j n-gram
80
95
if (j < n ) {
81
96
for (int ti = 0 ; ti < n - j ; ti ++) {
82
97
t_j [ti ] = special ; //add prefix
@@ -95,18 +110,21 @@ public double distance(String s0, String s1) {
95
110
for (int ni = 0 ; ni < n ; ni ++) {
96
111
if (sa [i - 1 + ni ] != t_j [ni ]) {
97
112
cost ++;
98
- } else if (sa [i - 1 + ni ] == special ) { //discount matches on prefix
113
+ } else if (sa [i - 1 + ni ] == special ) {
114
+ //discount matches on prefix
99
115
tn --;
100
116
}
101
117
}
102
118
float ec = (float ) cost / tn ;
103
- // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
104
- d [i ] = Math .min (Math .min (d [i - 1 ] + 1 , p [i ] + 1 ), p [i - 1 ] + ec );
119
+ // minimum of cell to the left+1, to the top+1,
120
+ // diagonally left and up +cost
121
+ d [i ] = Math .min (
122
+ Math .min (d [i - 1 ] + 1 , p [i ] + 1 ), p [i - 1 ] + ec );
105
123
}
106
124
// copy current distance counts to 'previous row' distance counts
107
- _d = p ;
125
+ d2 = p ;
108
126
p = d ;
109
- d = _d ;
127
+ d = d2 ;
110
128
}
111
129
112
130
// our last action in the above loop was to switch d and p, so p now
0 commit comments