Skip to content

Commit 8ba4957

Browse files
committed
In readme, added link to javadoc
1 parent fe4c606 commit 8ba4957

File tree

5 files changed

+74
-7
lines changed

5 files changed

+74
-7
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Using maven:
2525
</dependency>
2626
```
2727

28-
See [releases](https://github.com/tdebatty/java-string-similarity/releases).
28+
Or check the [releases](https://github.com/tdebatty/java-string-similarity/releases).
2929

3030
## Interfaces
3131
Although the topic might seem simple, a lot of different algorithms exist to measure text similarity or distance. Therefore the library defines some interfaces to categorize them.
@@ -42,6 +42,8 @@ Generally, algorithms that implement NormalizedStringSimilarity also implement N
4242
### Metric distances
4343
The MetricStringDistance interface: a few of the distances are actually metric distances, which means that they verify the triangle inequality d(x, y) <= d(x,z) + d(z,y). For example, Levenshtein is a metric distance, but NormalizedLevenshtein is not.
4444

45+
[Read Javadoc for details](http://api123.web-d.be/api/java-string-similarity/head/index.html)
46+
4547
## Shingles (n-gram) based similarity and distance
4648
A few algorithms work by converting strings into sets of n-grams (sequences of n characters, also sometimes called k-shingles). The similarity or distance between the strings is then the similarity or distance between the sets.
4749

src/main/java/info/debatty/java/stringsimilarity/KShingling.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import java.util.HashMap;
88
import java.util.Iterator;
99
import java.util.List;
10+
import java.util.Map;
1011
import java.util.regex.Pattern;
1112

1213
/**
@@ -190,8 +191,14 @@ private HashMap<Integer, Integer> getHashProfile(String s) {
190191

191192
return hash_profile;
192193
}
193-
194-
195-
196-
194+
195+
String getNGram(int key) {
196+
for (Map.Entry<String, Integer> entry : shingles.entrySet()) {
197+
if (entry.getValue().equals(key)) {
198+
return entry.getKey();
199+
}
200+
}
201+
202+
throw new InvalidParameterException("No ngram coresponds to key " + key);
203+
}
197204
}

src/main/java/info/debatty/java/stringsimilarity/StringProfile.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,22 @@
3232
* @author Thibault Debatty
3333
*/
3434
public class StringProfile {
35+
36+
public static void main(String[] args) {
37+
KShingling ks = new KShingling(2);
38+
for (String ngram : ks.getProfile("ABCABC").getMostFrequentNGrams(2)) {
39+
System.out.println(ngram);
40+
}
41+
42+
for (String ngram : ks.getProfile("A").getMostFrequentNGrams(2)) {
43+
System.out.println(ngram);
44+
}
45+
46+
for (String ngram : ks.getProfile("This is a string...").getMostFrequentNGrams(2)) {
47+
System.out.println(ngram);
48+
}
49+
}
50+
3551
private final SparseIntegerVector vector;
3652
private final KShingling ks;
3753

@@ -71,4 +87,39 @@ public double qgramDistance(StringProfile other) throws Exception {
7187
public SparseIntegerVector getSparseVector() {
7288
return this.vector;
7389
}
90+
91+
public String[] getMostFrequentNGrams(int number) {
92+
String[] strings = new String[number];
93+
int[] frequencies = new int[number];
94+
95+
int position_smallest_frequency = 0;
96+
97+
for (int i = 0; i < vector.size(); i++) {
98+
int key = vector.getKey(i);
99+
int frequency = vector.getValue(i);
100+
String ngram = ks.getNGram(key);
101+
102+
if (frequency > frequencies[position_smallest_frequency]) {
103+
// 1. replace the element with currently the smallest frequency
104+
strings[position_smallest_frequency] = ngram;
105+
frequencies[position_smallest_frequency] = frequency;
106+
107+
// 2. loop over frequencies to find which one is now the lowest
108+
// frequency
109+
int smallest_frequency = Integer.MAX_VALUE;
110+
for (int j = 0; j < frequencies.length; j++) {
111+
if (frequencies[j] < smallest_frequency) {
112+
position_smallest_frequency = j;
113+
smallest_frequency = frequencies[j];
114+
}
115+
}
116+
117+
}
118+
119+
}
120+
121+
return strings;
122+
123+
124+
}
74125
}

src/main/java/info/debatty/java/stringsimilarity/examples/MetricLCS.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,5 @@ public static void main(String[] args) {
4646
// longest = ABDEF => length = 5
4747
// => 1 - 4 / 5 = 0.2
4848
System.out.println(lcs.distance("ABDEF", "ABDIF"));
49-
}
50-
49+
}
5150
}

src/main/java/info/debatty/java/utils/SparseIntegerVector.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,4 +262,12 @@ public double qgram(SparseIntegerVector other) {
262262
public int size() {
263263
return this.size;
264264
}
265+
266+
public int getKey(int i) {
267+
return this.keys[i];
268+
}
269+
270+
public int getValue(int i) {
271+
return this.values[i];
272+
}
265273
}

0 commit comments

Comments
 (0)