diff --git a/changelog.txt b/changelog.txt new file mode 100644 index 0000000..a78b993 --- /dev/null +++ b/changelog.txt @@ -0,0 +1,15 @@ +This is a fork of the project Optimaize Language Detector https://github.com/optimaize/language-detector + +License Apache 2.0 remains the same + +Depending on the changes the project might be renamed in the future to establish a clear difference. + +Original version 0.7-SNAPSHOT + +02/09/2025 0.8-SNAPSHOT + + * Upgrading dependencies to address security vulnerabilities reported in Guava, Logback. + * Compiling with Java 11 + * Upgrading testing dependencies + * Gradually replacing the use of Guava classes with built-in JDK alternatives. + * Replacing cobertura with Jacoco as the former does not support Java 11 \ No newline at end of file diff --git a/pom.xml b/pom.xml index d30f4ed..ab0f660 100644 --- a/pom.xml +++ b/pom.xml @@ -5,10 +5,10 @@ com.optimaize.languagedetector language-detector language-detector - 0.7-SNAPSHOT + 0.8-SNAPSHOT jar - https://github.com/optimaize/language-detector + https://github.com/hsolano1/language-detector Language Detection Library for Java. 
@@ -23,8 +23,8 @@ UTF-8 - 1.7 - 1.7 + 11 + 11 @@ -40,12 +40,15 @@ Robert Theis + + Humberto Solano + - scm:git:https://github.com/optimaize/language-detector - scm:git:https://github.com/optimaize/language-detector - https://github.com/optimaize/language-detector + scm:git:https://github.com/hsolano1/language-detector + scm:git:https://github.com/hsolano1/language-detector + https://github.com/hsolano1/language-detector HEAD @@ -92,31 +95,30 @@ - - org.apache.maven.plugins - maven-javadoc-plugin - 2.9.1 + org.jacoco + jacoco-maven-plugin + 0.8.12 - attach-javadoc - verify - jar + prepare-agent + + + + + report + test + + report - - - org.codehaus.mojo - cobertura-maven-plugin - 2.6 - org.apache.maven.plugins maven-site-plugin - 3.3 + 3.21.0 org.apache.maven.plugins @@ -228,12 +230,12 @@ com.google.guava guava - 19.0 + 33.4.0-jre org.slf4j slf4j-api - 1.7.6 + 1.7.36 @@ -248,32 +250,32 @@ junit - junit-dep - 4.11 + junit + 4.13.2 test org.hamcrest hamcrest-core - 1.3 + 3.0 test org.hamcrest hamcrest-library - 1.3 + 3.0 test org.mockito - mockito-all - 1.9.5 + mockito-core + 5.15.2 test ch.qos.logback logback-classic - 1.1.1 + 1.5.16 test diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetector.java b/src/main/java/com/optimaize/langdetect/LanguageDetector.java index e60976c..1b8e551 100644 --- a/src/main/java/com/optimaize/langdetect/LanguageDetector.java +++ b/src/main/java/com/optimaize/langdetect/LanguageDetector.java @@ -16,7 +16,7 @@ package com.optimaize.langdetect; -import com.google.common.base.Optional; +import java.util.Optional; import com.optimaize.langdetect.i18n.LdLocale; import java.util.List; diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java index 4f26801..5cb63f0 100644 --- a/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java +++ b/src/main/java/com/optimaize/langdetect/LanguageDetectorBuilder.java @@ -16,7 +16,6 @@ 
package com.optimaize.langdetect; -import com.google.common.base.Optional; import com.optimaize.langdetect.i18n.LdLocale; import com.optimaize.langdetect.ngram.NgramExtractor; import com.optimaize.langdetect.profiles.LanguageProfile; @@ -25,6 +24,7 @@ import java.util.HashSet; import java.util.Map; +import java.util.Optional; import java.util.Set; @@ -43,7 +43,7 @@ public class LanguageDetectorBuilder { private final NgramExtractor ngramExtractor; private double alpha = ALPHA_DEFAULT; - private Optional seed = Optional.absent(); + private Optional seed = Optional.empty(); private int shortTextAlgorithm = 50; private double prefixFactor = 1.0d; private double suffixFactor = 1.0d; diff --git a/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java b/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java index 4402a58..6dbf646 100644 --- a/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java +++ b/src/main/java/com/optimaize/langdetect/LanguageDetectorImpl.java @@ -17,7 +17,6 @@ package com.optimaize.langdetect; import com.optimaize.langdetect.cybozu.util.Util; -import com.google.common.base.Optional; import com.optimaize.langdetect.i18n.LdLocale; import com.optimaize.langdetect.ngram.NgramExtractor; import org.jetbrains.annotations.NotNull; @@ -25,7 +24,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Random; /** * @@ -135,13 +140,13 @@ public int compare(DetectedLanguage a, DetectedLanguage b) { public Optional detect(CharSequence text) { List probabilities = getProbabilities(text); if (probabilities.isEmpty()) { - return Optional.absent(); + return Optional.empty(); } else { DetectedLanguage best = probabilities.get(0); if (best.getProbability() >= minimalConfidence) { return Optional.of(best.getLocale()); } else { - 
return Optional.absent(); + return Optional.empty(); } } } @@ -194,7 +199,7 @@ private double[] detectBlockShortText(Map ngrams) { private double[] detectBlockLongText(List ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; - Random rand = new Random(seed.or(DEFAULT_SEED)); + Random rand = new Random(seed.orElse(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); diff --git a/src/main/java/com/optimaize/langdetect/cybozu/CommandLineInterface.java b/src/main/java/com/optimaize/langdetect/cybozu/CommandLineInterface.java index a697f35..0881aa1 100644 --- a/src/main/java/com/optimaize/langdetect/cybozu/CommandLineInterface.java +++ b/src/main/java/com/optimaize/langdetect/cybozu/CommandLineInterface.java @@ -18,7 +18,6 @@ import com.optimaize.langdetect.frma.LangProfileWriter; import com.optimaize.langdetect.cybozu.util.LangProfile; -import com.google.common.base.Optional; import com.optimaize.langdetect.DetectedLanguage; import com.optimaize.langdetect.LanguageDetector; import com.optimaize.langdetect.LanguageDetectorBuilder; @@ -32,8 +31,20 @@ import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.io.*; -import java.util.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; /** * LangDetect Command Line Interface. 
@@ -286,7 +297,7 @@ public void batchTest() throws IOException { private LanguageDetector makeDetector() throws IOException { double alpha = getParamDouble("alpha", DEFAULT_ALPHA); String profileDirectory = requireParamString("directory") + "/"; - Optional seed = Optional.fromNullable(getParamLongOrNull("seed")); + Optional seed = Optional.ofNullable(getParamLongOrNull("seed")); List languageProfiles = new LanguageProfileReader().readAll(new File(profileDirectory)); diff --git a/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java b/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java index c084f7b..7886144 100644 --- a/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java +++ b/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java @@ -16,11 +16,11 @@ package com.optimaize.langdetect.i18n; -import com.google.common.base.Optional; import com.google.common.base.Splitter; import org.jetbrains.annotations.NotNull; import java.util.List; +import java.util.Optional; /** * A language-detector implementation of a Locale, similar to the java.util.Locale. 
@@ -106,8 +106,8 @@ public static LdLocale fromString(@NotNull String string) { } } assert language != null; - if (script==null) script = Optional.absent(); - if (region==null) region = Optional.absent(); + if (script==null) script = Optional.empty(); + if (region==null) region = Optional.empty(); return new LdLocale(language, script, region); } diff --git a/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java b/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java index 09c6beb..0e56c32 100644 --- a/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java +++ b/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java @@ -16,7 +16,6 @@ package com.optimaize.langdetect.ngram; -import com.google.common.collect.ImmutableList; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -39,7 +38,7 @@ public class NgramExtractor { private final Character textPadding; public static NgramExtractor gramLength(int gramLength) { - return new NgramExtractor(ImmutableList.of(gramLength), null, null); + return new NgramExtractor(List.of(gramLength), null, null); } public static NgramExtractor gramLengths(Integer... 
gramLength) { return new NgramExtractor(Arrays.asList(gramLength), null, null); @@ -65,7 +64,7 @@ public NgramExtractor textPadding(char textPadding) { private NgramExtractor(@NotNull List gramLengths, @Nullable NgramFilter filter, @Nullable Character textPadding) { if (gramLengths.isEmpty()) throw new IllegalArgumentException(); - this.gramLengths = ImmutableList.copyOf(gramLengths); + this.gramLengths = List.copyOf(gramLengths); this.filter = filter; this.textPadding = textPadding; } diff --git a/src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java b/src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java index e344e68..e71adfe 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java +++ b/src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java @@ -16,7 +16,6 @@ package com.optimaize.langdetect.profiles; -import com.google.common.collect.ImmutableList; import com.optimaize.langdetect.i18n.LdLocale; import java.util.ArrayList; @@ -106,7 +105,7 @@ public class BuiltInLanguages { names.add(LdLocale.fromString("zh-CN")); names.add(LdLocale.fromString("zh-TW")); - languages = ImmutableList.copyOf(names); + languages = List.copyOf(names); } static { @@ -128,7 +127,7 @@ public class BuiltInLanguages { texts.add("sv"); texts.add("tr"); texts.add("vi"); - shortTextLanguages = ImmutableList.copyOf(texts); + shortTextLanguages = List.copyOf(texts); } /** diff --git a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java index c8f3c29..7914972 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java +++ b/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java @@ -16,12 +16,15 @@ package com.optimaize.langdetect.profiles; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.optimaize.langdetect.i18n.LdLocale; import 
org.jetbrains.annotations.NotNull; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** *

This class is immutable.

@@ -63,9 +66,9 @@ private static class Stats { public Stats(@NotNull Map numOccurrences, @NotNull Map minGramCounts, @NotNull Map maxGramCounts) { - this.numOccurrences = ImmutableMap.copyOf(numOccurrences); - this.minGramCounts = ImmutableMap.copyOf(minGramCounts); - this.maxGramCounts = ImmutableMap.copyOf(maxGramCounts); + this.numOccurrences = Map.copyOf(numOccurrences); + this.minGramCounts = Map.copyOf(minGramCounts); + this.maxGramCounts = Map.copyOf(maxGramCounts); } } @@ -76,7 +79,7 @@ public Stats(@NotNull Map numOccurrences, LanguageProfileImpl(@NotNull LdLocale locale, @NotNull Map> ngrams) { this.locale = locale; - this.ngrams = ImmutableMap.copyOf(ngrams); + this.ngrams = Map.copyOf(ngrams); this.stats = makeStats(ngrams); } diff --git a/src/main/java/com/optimaize/langdetect/profiles/util/LanguageProfileValidator.java b/src/main/java/com/optimaize/langdetect/profiles/util/LanguageProfileValidator.java index 1f122f1..42fed9f 100644 --- a/src/main/java/com/optimaize/langdetect/profiles/util/LanguageProfileValidator.java +++ b/src/main/java/com/optimaize/langdetect/profiles/util/LanguageProfileValidator.java @@ -31,12 +31,10 @@ import com.optimaize.langdetect.text.TextObjectFactory; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.NoSuchElementException; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Performs k-fold cross-validation. @@ -57,7 +55,7 @@ public class LanguageProfileValidator { /** * All loaded language profiles. 
*/ - private final List languageProfiles = new ArrayList<>(); + private /*final*/ List languageProfiles = new ArrayList<>(); private LanguageProfileBuilder languageProfileBuilder; private TextObject inputSample; @@ -175,11 +173,11 @@ public double validate() { List detectedLanguages = languageDetector.getProbabilities(testSample); try{ - DetectedLanguage kResult = Iterables.find(detectedLanguages, new Predicate() { - public boolean apply(DetectedLanguage language) { - return language.getLocale().getLanguage().equals(languageProfile.getLocale().getLanguage()); - } - }); + DetectedLanguage kResult = detectedLanguages + .stream() + .filter(language -> Objects.equals(language.getLocale().getLanguage(), languageProfile.getLocale().getLanguage())) + .findFirst() + .orElseThrow(); probabilities.add(kResult.getProbability()); System.out.println("Probability: " + kResult.getProbability()); diff --git a/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java b/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java index dd32d77..8e74c02 100644 --- a/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java +++ b/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java @@ -16,7 +16,6 @@ package com.optimaize.langdetect.text; -import com.google.common.collect.ImmutableList; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -39,7 +38,7 @@ public MultiTextFilter(@NotNull List filters) { if (filters.isEmpty()) { this.filters = null; } else { - this.filters = ImmutableList.copyOf(filters); + this.filters = List.copyOf(filters); } } diff --git a/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java b/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java index df1f3da..20ffe25 100644 --- a/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java +++ b/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java @@ -18,17 +18,18 @@ import 
com.optimaize.langdetect.frma.LangProfileReader; import com.optimaize.langdetect.cybozu.util.LangProfile; -import com.google.common.collect.ImmutableList; import com.optimaize.langdetect.ngram.NgramExtractors; import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.OldLangProfileConverter; -import com.optimaize.langdetect.text.*; +import com.optimaize.langdetect.text.CommonTextObjectFactories; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.IOException; import java.util.List; -import static org.testng.Assert.*; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; /** @@ -63,7 +64,7 @@ private LanguageDetector makeNewDetector() throws IOException { .suffixFactor(2.0); LangProfileReader langProfileReader = new LangProfileReader(); - for (String language : ImmutableList.of("en", "fr", "nl", "de")) { + for (String language : List.of("en", "fr", "nl", "de")) { LangProfile langProfile = langProfileReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + language)); LanguageProfile languageProfile = OldLangProfileConverter.convert(langProfile); builder.withProfile(languageProfile); diff --git a/src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java b/src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java index 3f99380..9ce2f6a 100644 --- a/src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java +++ b/src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java @@ -16,7 +16,6 @@ package com.optimaize.langdetect; -import com.google.common.collect.ImmutableSet; import com.optimaize.langdetect.i18n.LdLocale; import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.LanguageProfileReader; @@ -25,6 +24,7 @@ import java.io.IOException; import java.util.List; +import java.util.Set; import static org.junit.Assert.assertEquals; import static 
org.junit.Assert.assertTrue; @@ -44,7 +44,7 @@ public static void init() throws IOException { } private static NgramFrequencyData forAll(int gramSize) throws IOException { List languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - return NgramFrequencyData.create(languageProfiles, ImmutableSet.of(gramSize)); + return NgramFrequencyData.create(languageProfiles, Set.of(gramSize)); } diff --git a/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java b/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java index 3396946..d27230b 100644 --- a/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java +++ b/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java @@ -16,7 +16,6 @@ package com.optimaize.langdetect.profiles; -import com.google.common.collect.ImmutableList; import com.optimaize.langdetect.i18n.LdLocale; import org.junit.Test; @@ -68,7 +67,7 @@ private static void checkProfileFile(String language, int nWordSize, int freqSiz assertThat(languageProfile, is(notNullValue())); assertThat(languageProfile.getLocale().getLanguage(), is(equalTo(language))); assertEquals(languageProfile.getGramLengths().size(), nWordSize); - assertEquals(languageProfile.getGramLengths(), ImmutableList.of(1, 2, 3)); + assertEquals(languageProfile.getGramLengths(), List.of(1, 2, 3)); assertEquals(languageProfile.getNumGrams(), freqSize); assertTrue(languageProfile.getMinGramCount(nWordSize) < languageProfile.getMaxGramCount(nWordSize)); @@ -79,7 +78,7 @@ private static void checkProfileFile(String language, int nWordSize, int freqSiz @Test public void readFromDir() throws IOException { - List read = new LanguageProfileReader().read(ImmutableList.of("de", "fr")); + List read = new LanguageProfileReader().read(List.of("de", "fr")); assertEquals(read.size(), 2); } @@ -88,7 +87,7 @@ public void readFromDirWithClassloader() throws IOException { List read = new LanguageProfileReader().read( 
LanguageProfileReaderTest.class.getClassLoader(), "languages", - ImmutableList.of("de", "fr") + List.of("de", "fr") ); assertEquals(read.size(), 2); } @@ -96,19 +95,19 @@ public void readFromDirWithClassloader() throws IOException { @Test public void read() throws IOException { - List read = new LanguageProfileReader().read(ImmutableList.of("de", "fr")); + List read = new LanguageProfileReader().read(List.of("de", "fr")); assertEquals(read.size(), 2); } @Test public void read_folder() throws IOException { - List read = new LanguageProfileReader().read("languages", ImmutableList.of("de", "fr")); + List read = new LanguageProfileReader().read("languages", List.of("de", "fr")); assertEquals(read.size(), 2); } @Test public void read_classpathAndFolder() throws IOException { - List read = new LanguageProfileReader().read(LanguageProfileReaderTest.class.getClassLoader(), "languages", ImmutableList.of("de", "fr")); + List read = new LanguageProfileReader().read(LanguageProfileReaderTest.class.getClassLoader(), "languages", List.of("de", "fr")); assertEquals(read.size(), 2); } @@ -131,7 +130,7 @@ private void verify_readAllBuiltIn(List profiles) { @Test public void loadProfilesFromClasspath() throws IOException { - List result = new LanguageProfileReader().read(this.getClass().getClassLoader(), "languages", ImmutableList.of("en", "fr", "nl", "de")); + List result = new LanguageProfileReader().read(this.getClass().getClassLoader(), "languages", List.of("en", "fr", "nl", "de")); assertEquals(result.size(), 4); } diff --git a/src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java b/src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java index ae59611..9e4a660 100644 --- a/src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java +++ b/src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java @@ -16,10 +16,10 @@ package com.optimaize.langdetect.text; -import com.google.common.collect.ImmutableList; import org.junit.Test; import 
java.util.Collections; +import java.util.List; import static org.junit.Assert.assertEquals; @@ -35,7 +35,7 @@ public void empty() throws Exception { @Test public void doubleFilter() throws Exception { - assertEquals(new MultiTextFilter(ImmutableList.of( + assertEquals(new MultiTextFilter(List.of( new TextFilter() { @Override public String filter(CharSequence text) { diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml index f4e8ff9..6c83ad0 100644 --- a/src/test/resources/logback-test.xml +++ b/src/test/resources/logback-test.xml @@ -1,21 +1,17 @@ - - - System.out - - %d{yyyy-MM-dd/HH:mm:ss.SSS/zzz} [%t] %-5p %m%n + + + %d{yyyy.MM.dd HH:mm:ss.SSS} Thread[%thread] %logger{20} - %msg%n + UTF-8 - - INFO - - + \ No newline at end of file