This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch migrate_nlp-utils-sandbox-component_to_java11 in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit a398878c9ebb5c62d51ecbfb981b84e2fba9b7f0 Author: Martin Wiesner <[email protected]> AuthorDate: Thu Jan 26 21:26:20 2023 +0100 migrate sandbox component 'nlp-utils' to Java 11 - adjusts parent project (org.apache.apache) to version 18 - adjusts Java language level to 11 - cures some deprecation issues - improves existing JavaDoc --- nlp-utils/pom.xml | 25 +- .../anomalydetection/AnomalyDetectionUtils.java | 7 +- .../org/apache/opennlp/utils/cfg/CFGBuilder.java | 2 +- .../org/apache/opennlp/utils/cfg/CFGRunner.java | 265 ++++++++++----------- .../opennlp/utils/cfg/ContextFreeGrammar.java | 8 +- .../utils/cfg/ProbabilisticContextFreeGrammar.java | 22 +- .../classification/SimpleNaiveBayesClassifier.java | 27 ++- .../UpdatableSimpleNaiveBayesClassifier.java | 24 +- .../opennlp/utils/languagemodel/LanguageModel.java | 12 +- .../languagemodel/NaiveSentenceLanguageModel.java | 4 +- .../opennlp/utils/languagemodel/NoisyChannel.java | 4 +- .../TrigramSentenceLanguageModel.java | 2 +- .../org/apache/opennlp/utils/ngram/NGramUtils.java | 6 +- .../utils/regression/GradientDescentUtils.java | 2 +- .../opennlp/utils/regression/Hypothesis.java | 2 +- .../regression/LinearCombinationHypothesis.java | 2 +- .../utils/regression/RegressionModelUtils.java | 32 +-- .../java/org/apache/opennlp/utils/TestUtils.java | 8 +- .../AnomalyDetectionUtilsTest.java | 5 +- .../SimpleNaiveBayesClassifierTest.java | 6 +- .../NaiveSentenceLanguageModelTest.java | 16 +- .../TrigramSentenceLanguageModelTest.java | 16 +- .../apache/opennlp/utils/ngram/NGramUtilsTest.java | 12 +- .../utils/regression/GradientDescentUtilsTest.java | 2 +- .../utils/regression/RegressionModelUtilsTest.java | 13 +- 25 files changed, 268 insertions(+), 256 deletions(-) diff --git a/nlp-utils/pom.xml b/nlp-utils/pom.xml index 70d0df9..5a006ab 100644 --- a/nlp-utils/pom.xml +++ b/nlp-utils/pom.xml @@ -1,3 +1,4 @@ +<?xml version="1.0" encoding="UTF-8"?> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file @@ -19,12 +20,22 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> - + <parent> + <groupId>org.apache</groupId> + <artifactId>apache</artifactId> + <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. --> + <version>18</version> + <relativePath /> + </parent> + <groupId>org.apache.opennlp</groupId> <artifactId>nlp-utils</artifactId> - <version>0.1-SNAPSHOT</version> + <version>2.1.1-SNAPSHOT</version> + <name>Apache OpenNLP Utils</name> <properties> + <maven.compiler.source>11</maven.compiler.source> + <maven.compiler.target>11</maven.compiler.target> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> @@ -32,20 +43,20 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.11</version> + <version>4.13.2</version> <scope>test</scope> </dependency> </dependencies> + <build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> - <version>2.0.2</version> <configuration> - <source>1.7</source> - <target>1.7</target> - <encoding>UTF-8</encoding> + <source>${maven.compiler.source}</source> + <target>${maven.compiler.target}</target> + <compilerArgument>-Xlint</compilerArgument> </configuration> </plugin> </plugins> diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java index 0d7d4a8..009441f 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtils.java @@ -19,6 +19,8 @@ package org.apache.opennlp.utils.anomalydetection; import java.math.BigDecimal; +import java.math.RoundingMode; + import org.apache.opennlp.utils.TrainingExample; import org.apache.opennlp.utils.TrainingSet; @@ -84,9 +86,8 @@ public class AnomalyDetectionUtils { * @param x the input * @param set the training set * @return the probability of the given input - * @throws Exception */ - public static double getGaussianProbability(TrainingExample x, TrainingSet set) throws Exception { + public static double getGaussianProbability(TrainingExample x, TrainingSet set) { double[] mus = fitMus(set); double[] sigmas = fitSigmas(mus, set); return calculateGaussianProbability(x, mus, sigmas); @@ -97,7 +98,7 @@ public class AnomalyDetectionUtils { assert mus.length == sigmas.length : "parameters not aligned"; BigDecimal px = new BigDecimal(1d); for (int i = 0; i < mus.length; i++) { - BigDecimal firstTerm = BigDecimal.ONE.divide(BigDecimal.valueOf(Math.sqrt(2d * Math.PI * sigmas[i])), BigDecimal.ROUND_CEILING); + BigDecimal firstTerm = BigDecimal.ONE.divide(BigDecimal.valueOf(Math.sqrt(2d * Math.PI * sigmas[i])), RoundingMode.CEILING); BigDecimal secondTerm = BigDecimal.valueOf(Math.exp(-1 * (Math.pow(x.getInputs()[i] - mus[i], 2) / (2 * Math.pow(sigmas[i], 2))))); px = px.multiply(firstTerm.multiply(secondTerm)); } diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java index 7cca8ee..806433f 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGBuilder.java @@ -62,7 +62,7 @@ public class CFGBuilder { public ContextFreeGrammar build() { assert nonTerminalSymbols != null && terminalSymbols != null && rules != null && startSymbol != null : - "missing definitions { V : " + nonTerminalSymbols + ", ∑ : " + terminalSymbols + ", R : " + rules + ", S : " + startSymbol + "}"; + "missing definitions {V : " + nonTerminalSymbols + ", ∑ : " + terminalSymbols + ", R : " + rules + ", S : " + startSymbol + "}"; return new ContextFreeGrammar(nonTerminalSymbols, terminalSymbols, rules, startSymbol, randomExpansion); } } diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java index e3bb59b..07d93e1 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java @@ -34,147 +34,146 @@ import java.util.Map; */ public class CFGRunner { - public static void main(String[] args) throws Exception { - CFGBuilder builder = new CFGBuilder(); - - Arrays.sort(args); - boolean useWn = Arrays.binarySearch(args, "-wn") >= 0; - - Collection<String> adverbsCollection; - Collection<String> verbsCollection; - Collection<String> adjectivesCollection; - Collection<String> nounsCollection; - if (useWn) { - adverbsCollection = getTokens("/opennlp/cfg/wn/adv.txt"); - adjectivesCollection = getTokens("/opennlp/cfg/wn/adj.txt"); - nounsCollection = getTokens("/opennlp/cfg/wn/noun.txt"); - verbsCollection = getTokens("/opennlp/cfg/wn/verb.txt"); - } else { - adverbsCollection = getTokens("/opennlp/cfg/an/adv.txt"); - adjectivesCollection = getTokens("/opennlp/cfg/an/adj.txt"); - nounsCollection = getTokens("/opennlp/cfg/an/noun.txt"); - verbsCollection = getTokens("/opennlp/cfg/an/verb.txt"); - } + public static void main(String[] args) throws Exception { + CFGBuilder builder = new CFGBuilder(); + + Arrays.sort(args); + boolean useWn = Arrays.binarySearch(args, "-wn") >= 0; + + Collection<String> adverbsCollection; + Collection<String> verbsCollection; + Collection<String> adjectivesCollection; + Collection<String> nounsCollection; + if (useWn) { + adverbsCollection = getTokens("/opennlp/cfg/wn/adv.txt"); + adjectivesCollection = getTokens("/opennlp/cfg/wn/adj.txt"); + nounsCollection = getTokens("/opennlp/cfg/wn/noun.txt"); + verbsCollection = getTokens("/opennlp/cfg/wn/verb.txt"); + } else { + adverbsCollection = getTokens("/opennlp/cfg/an/adv.txt"); + adjectivesCollection = getTokens("/opennlp/cfg/an/adj.txt"); + nounsCollection = getTokens("/opennlp/cfg/an/noun.txt"); + verbsCollection = getTokens("/opennlp/cfg/an/verb.txt"); + } + + Collection<String> terminals = new LinkedList<>(); + terminals.addAll(adverbsCollection); + terminals.addAll(verbsCollection); + terminals.addAll(adjectivesCollection); + terminals.addAll(nounsCollection); + + builder.withTerminals(terminals); + + Collection<String> nonTerminals = new LinkedList<>(); + String startSymbol = "START_SYMBOL"; + nonTerminals.add(startSymbol); + nonTerminals.add("NP"); + nonTerminals.add("NN"); + nonTerminals.add("Adv"); + nonTerminals.add("Adj"); + nonTerminals.add("VP"); + nonTerminals.add("Vb"); + builder.withNonTerminals(nonTerminals); + + builder.withStartSymbol(startSymbol); + + Collection<Rule> rules = new LinkedList<Rule>(); + rules.add(new Rule(startSymbol, "VP", "NP")); + rules.add(new Rule("VP", "Adv", "Vb")); + rules.add(new Rule("NP", "Adj", "NN")); + + for (String v : verbsCollection) { + rules.add(new Rule("Vb", v)); + } + for (String adj : adjectivesCollection) { + rules.add(new Rule("Adj", adj)); + } + for (String n : nounsCollection) { + rules.add(new Rule("NN", n)); + } + for (String adv : adverbsCollection) { + rules.add(new Rule("Adv", adv)); + } + builder.withRules(rules); + ContextFreeGrammar cfg = builder.withRandomExpansion(true).build(); + String[] sentence = cfg.leftMostDerivation(startSymbol); + String toString = Arrays.toString(sentence); + + if (toString.length() > 0) { + System.out.println(toString.substring(1, toString.length() - 1).replaceAll(",", "")); + } - Collection<String> terminals = new LinkedList<>(); - terminals.addAll(adverbsCollection); - terminals.addAll(verbsCollection); - terminals.addAll(adjectivesCollection); - terminals.addAll(nounsCollection); - - builder.withTerminals(terminals); - - Collection<String> nonTerminals = new LinkedList<String>(); - String startSymbol = "START_SYMBOL"; - nonTerminals.add(startSymbol); - nonTerminals.add("NP"); - nonTerminals.add("NN"); - nonTerminals.add("Adv"); - nonTerminals.add("Adj"); - nonTerminals.add("VP"); - nonTerminals.add("Vb"); - builder.withNonTerminals(nonTerminals); - - builder.withStartSymbol(startSymbol); - - Collection<Rule> rules = new LinkedList<Rule>(); - rules.add(new Rule(startSymbol, "VP", "NP")); - rules.add(new Rule("VP", "Adv", "Vb")); - rules.add(new Rule("NP", "Adj", "NN")); - - for (String v : verbsCollection) { - rules.add(new Rule("Vb", v)); + boolean pt = Arrays.binarySearch(args, "-pt") >= 0; + + if (pt) { + Map<Rule, Double> rulesMap = new HashMap<>(); + rulesMap.put(new Rule(startSymbol, "VP", "NP"), 1d); + rulesMap.put(new Rule("VP", "Adv", "Vb"), 1d); + rulesMap.put(new Rule("NP", "Adj", "NN"), 1d); + + SecureRandom secureRandom = new SecureRandom(); + + double remainingP = 1d; + for (String v : verbsCollection) { + double p = (double) secureRandom.nextInt(1000) / 1001d; + if (rulesMap.size() == verbsCollection.size() - 1) { + p = remainingP; } - for (String adj : adjectivesCollection) { - rules.add(new Rule("Adj", adj)); + if (remainingP - p <= 0) { + p /= 10; } - for (String n : nounsCollection) { - rules.add(new Rule("NN", n)); + rulesMap.put(new Rule("Vb", v), p); + remainingP -= p; + } + for (String a : adjectivesCollection) { + double p = (double) secureRandom.nextInt(1000) / 1001d; + if (rulesMap.size() == adjectivesCollection.size() - 1) { + p = remainingP; } - for (String adv : adverbsCollection) { - rules.add(new Rule("Adv", adv)); + if (remainingP - p <= 0) { + p /= 10; } - builder.withRules(rules); - ContextFreeGrammar cfg = builder.withRandomExpansion(true).build(); - String[] sentence = cfg.leftMostDerivation(startSymbol); - String toString = Arrays.toString(sentence); - - if (toString.length() > 0) { - System.out.println(toString.substring(1, toString.length() - 1).replaceAll(",", "")); + rulesMap.put(new Rule("Adj", a), p); + remainingP -= p; + } + for (String n : nounsCollection) { + double p = (double) secureRandom.nextInt(1000) / 1001d; + if (rulesMap.size() == nounsCollection.size() - 1) { + p = remainingP; + } else if (remainingP - p <= 0) { + p /= 10; } - - boolean pt = Arrays.binarySearch(args, "-pt") >= 0; - - if (pt) { - Map<Rule, Double> rulesMap = new HashMap<>(); - rulesMap.put(new Rule(startSymbol, "VP", "NP"), 1d); - rulesMap.put(new Rule("VP", "Adv", "Vb"), 1d); - rulesMap.put(new Rule("NP", "Adj", "NN"), 1d); - - SecureRandom secureRandom = new SecureRandom(); - - double remainingP = 1d; - for (String v : verbsCollection) { - double p = (double) secureRandom.nextInt(1000) / 1001d; - if (rulesMap.size() == verbsCollection.size() - 1) { - p = remainingP; - } - if (remainingP - p <= 0) { - p /= 10; - } - rulesMap.put(new Rule("Vb", v), p); - remainingP -= p; - } - for (String a : adjectivesCollection) { - double p = (double) secureRandom.nextInt(1000) / 1001d; - if (rulesMap.size() == adjectivesCollection.size() - 1) { - p = remainingP; - } - if (remainingP - p <= 0) { - p /= 10; - } - rulesMap.put(new Rule("Adj", a), p); - remainingP -= p; - } - for (String n : nounsCollection) { - double p = (double) secureRandom.nextInt(1000) / 1001d; - if (rulesMap.size() == nounsCollection.size() - 1) { - p = remainingP; - } else if (remainingP - p <= 0) { - p /= 10; - } - rulesMap.put(new Rule("NN", n), p); - remainingP -= p; - } - for (String a : adverbsCollection) { - double p = (double) secureRandom.nextInt(1000) / 1001d; - if (rulesMap.size() == adverbsCollection.size() - 1) { - p = remainingP; - } - if (remainingP - p <= 0) { - p /= 10; - } - rulesMap.put(new Rule("Adv", a), p); - remainingP -= p; - } - ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(cfg.getNonTerminalSymbols(), cfg.getTerminalSymbols(), - rulesMap, startSymbol, true); - ProbabilisticContextFreeGrammar.ParseTree parseTree = pcfg.cky(Arrays.asList(sentence)); - System.out.println(parseTree); + rulesMap.put(new Rule("NN", n), p); + remainingP -= p; + } + for (String a : adverbsCollection) { + double p = (double) secureRandom.nextInt(1000) / 1001d; + if (rulesMap.size() == adverbsCollection.size() - 1) { + p = remainingP; } - } - - private static Collection<String> getTokens(String s) throws IOException { - Collection<String> tokens = new LinkedList<>(); - InputStream resourceStream = CFGRunner.class.getResourceAsStream(s); - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(resourceStream)); - String line; - while ((line = bufferedReader.readLine()) != null) { - tokens.add(line); + if (remainingP - p <= 0) { + p /= 10; } - bufferedReader.close(); - resourceStream.close(); - return tokens; + rulesMap.put(new Rule("Adv", a), p); + remainingP -= p; + } + ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(cfg.getNonTerminalSymbols(), cfg.getTerminalSymbols(), + rulesMap, startSymbol, true); + ProbabilisticContextFreeGrammar.ParseTree parseTree = pcfg.cky(Arrays.asList(sentence)); + System.out.println(parseTree); + } + } + + private static Collection<String> getTokens(String s) throws IOException { + Collection<String> tokens = new LinkedList<>(); + try (BufferedReader bufferedReader = new BufferedReader( + new InputStreamReader(CFGRunner.class.getResourceAsStream(s)))) { + String line; + while ((line = bufferedReader.readLine()) != null) { + tokens.add(line); + } } + return tokens; + } } diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java index c3419ed..e3e4826 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ContextFreeGrammar.java @@ -65,7 +65,7 @@ public class ContextFreeGrammar { } public String[] leftMostDerivation(String... words) { - ArrayList<String> expansion = new ArrayList<String>(words.length); + ArrayList<String> expansion = new ArrayList<>(words.length); assert words.length > 0 && startSymbol.equals(words[0]); @@ -78,13 +78,13 @@ public class ContextFreeGrammar { private Collection<String> getTerminals(String word) { if (terminalSymbols.contains(word)) { - Collection<String> c = new LinkedList<String>(); + Collection<String> c = new LinkedList<>(); c.add(word); return c; } else { assert nonTerminalSymbols.contains(word) : "word " + word + " is not contained in non terminals"; String[] expansions = getExpansionForSymbol(word); - Collection<String> c = new LinkedList<String>(); + Collection<String> c = new LinkedList<>(); for (String e : expansions) { c.addAll(getTerminals(e)); } @@ -98,7 +98,7 @@ public class ContextFreeGrammar { } private Rule getRuleForSymbol(String word) { - ArrayList<Rule> possibleRules = new ArrayList<Rule>(); + ArrayList<Rule> possibleRules = new ArrayList<>(); for (Rule r : rules) { if (word.equals(r.getEntry())) { if (!randomExpansion) { diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java index f5d936c..edd6b37 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java @@ -39,7 +39,7 @@ public class ProbabilisticContextFreeGrammar { private final Collection<String> terminalSymbols; private final Map<Rule, Double> rules; private final String startSymbol; - private boolean randomExpansion; + private final boolean randomExpansion; private static final Rule emptyRule = new Rule("EMPTY~", ""); @@ -86,7 +86,7 @@ public class ProbabilisticContextFreeGrammar { public String[] leftMostDerivation(String... words) { - ArrayList<String> expansion = new ArrayList<String>(words.length); + ArrayList<String> expansion = new ArrayList<>(words.length); assert words.length > 0 && startSymbol.equals(words[0]); @@ -98,13 +98,13 @@ public class ProbabilisticContextFreeGrammar { private Collection<String> getTerminals(String word) { if (terminalSymbols.contains(word)) { - Collection<String> c = new LinkedList<String>(); + Collection<String> c = new LinkedList<>(); c.add(word); return c; } else { assert nonTerminalSymbols.contains(word) : "word " + word + " is not contained in non terminals"; String[] expansions = getExpansionForSymbol(word); - Collection<String> c = new LinkedList<String>(); + Collection<String> c = new LinkedList<>(); for (String e : expansions) { c.addAll(getTerminals(e)); } @@ -118,7 +118,7 @@ public class ProbabilisticContextFreeGrammar { } private Rule getRuleForSymbol(String word) { - ArrayList<Rule> possibleRules = new ArrayList<Rule>(); + ArrayList<Rule> possibleRules = new ArrayList<>(); for (Rule r : rules.keySet()) { if (word.equals(r.getEntry())) { if (!randomExpansion) { @@ -186,7 +186,7 @@ public class ProbabilisticContextFreeGrammar { } private Collection<Rule> getRulesForNonTerminal(String x) { - LinkedList<Rule> ntRules = new LinkedList<Rule>(); + LinkedList<Rule> ntRules = new LinkedList<>(); for (Rule r : rules.keySet()) { String[] expansion = r.getExpansion(); if (expansion.length == 2 && x.equals(r.getEntry()) && nonTerminalSymbols.contains(expansion[0]) && nonTerminalSymbols.contains(expansion[1])) { @@ -197,7 +197,7 @@ public class ProbabilisticContextFreeGrammar { } private Collection<Rule> getNTRules() { - Collection<Rule> ntRules = new LinkedList<Rule>(); + Collection<Rule> ntRules = new LinkedList<>(); for (Rule r : rules.keySet()) { String[] expansion = r.getExpansion(); if (expansion.length == 2 && nonTerminalSymbols.contains(expansion[0]) && nonTerminalSymbols.contains(expansion[1])) { @@ -211,7 +211,7 @@ public class ProbabilisticContextFreeGrammar { return rules.keySet().contains(rule) ? rules.get(rule) : 0; } - public class ParseTree { + public static class ParseTree { private final double probability; private final int splitPoint; @@ -281,11 +281,11 @@ public class ProbabilisticContextFreeGrammar { } public static ProbabilisticContextFreeGrammar parseGrammar(boolean trim, String... parseTreeStrings) { - return parseGrammar(new HashMap<Rule, Double>(), "S", trim, parseTreeStrings); + return parseGrammar(new HashMap<>(), "S", trim, parseTreeStrings); } public static ProbabilisticContextFreeGrammar parseGrammar(String... parseTreeStrings) { - return parseGrammar(new HashMap<Rule, Double>(), "S", true, parseTreeStrings); + return parseGrammar(new HashMap<>(), "S", true, parseTreeStrings); } public static ProbabilisticContextFreeGrammar parseGrammar(Map<Rule, Double> rulesMap, String startSymbol, boolean trim, String... parseStrings) { @@ -303,7 +303,7 @@ public class ProbabilisticContextFreeGrammar { for (String parseTreeString : parseStrings) { if (trim) { - parseTreeString = parseTreeString.replaceAll("\n", "").replaceAll("\t", "").replaceAll("\\s+", " "); + parseTreeString = parseTreeString.replace("\n", "").replace("\t", "").replaceAll("\\s+", " "); } String toConsume = String.valueOf(parseTreeString); diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java index b7bf33c..1114677 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifier.java @@ -25,16 +25,17 @@ import java.util.LinkedList; import java.util.Map; /** - * C = argmax( P(d|c) * P(c) ) - * where P(d|c) is called: likelihood - * and P(c) is called: prior - we can count relative frequencies in a corpus - * and d is a vector of features - * <p/> - * we assume: - * 1. bag of words assumption: positions don't matter - * 2. conditional independence: the feature probabilities are independent given a class - * <p/> - * thus P(d|c) == P(x1,..,xn|c) == P(x1|c)*...P(xn|c) + * {@code C = argmax( P(d|c) * P(c) )} + * where {@code P(d|c)} is called: likelihood + * and {@code P(c)} is called: prior - we can count relative frequencies in a corpus + * and {@code d} is a vector of features. + * <p> + * We assume: + * <ol> + * <li>bag of words assumption: positions don't matter</li> + * <li>conditional independence: the feature probabilities are independent given a class</li> + * </ol> + * thus {@code P(d|c) == P(x1,..,xn|c) == P(x1|c)*...P(xn|c)} */ public class SimpleNaiveBayesClassifier implements NaiveBayesClassifier<String, String> { @@ -104,10 +105,10 @@ public class SimpleNaiveBayesClassifier implements NaiveBayesClassifier<String, @Override public String calculateClass(String inputDocument) { - Double max = 0d; + double max = 0d; String foundClass = null; for (String cl : classMegaDocMap.keySet()) { - Double clVal = priors.get(cl) * calculateLikelihood(inputDocument, cl); + double clVal = priors.get(cl) * calculateLikelihood(inputDocument, cl); if (clVal > max) { max = clVal; foundClass = cl; @@ -120,7 +121,7 @@ public class SimpleNaiveBayesClassifier implements NaiveBayesClassifier<String, private Double calculateLikelihood(String document, String c) { String megaDoc = classMegaDocMap.get(c); // for each word - Double result = 1d; + double result = 1d; for (String word : tokenizeDoc(document)) { // num : count the no of times the word appears in documents of class c (+1) double num = count(word, megaDoc) + 1; // +1 is added because of add 1 smoothing diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java index dab6f60..a3c064a 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/classification/UpdatableSimpleNaiveBayesClassifier.java @@ -25,22 +25,21 @@ import java.util.List; import java.util.Map; import java.util.TreeSet; - public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier<List<String>, String> { - private final Collection<String> vocabulary = new TreeSet<String>(); // the bag of all the words in the corpus - private final Map<String, Integer> classCounts = new LinkedHashMap<String, Integer>(); + private final Collection<String> vocabulary = new TreeSet<>(); // the bag of all the words in the corpus + private final Map<String, Integer> classCounts = new LinkedHashMap<>(); private double noDocs = 0d; - private final Map<String, Map<String, Integer>> nm = new HashMap<String, Map<String, Integer>>(); - private final Map<String, Double> priors = new HashMap<String, Double>(); - private final Map<String, Double> dens = new HashMap<String, Double>(); + private final Map<String, Map<String, Integer>> nm = new HashMap<>(); + private final Map<String, Double> priors = new HashMap<>(); + private final Map<String, Double> dens = new HashMap<>(); public void addExample(String klass, List<String> words) { vocabulary.addAll(words); Integer integer = classCounts.get(klass); - Integer f = integer != null ? integer : 0; + int f = integer != null ? integer : 0; classCounts.put(klass, f + 1); noDocs++; @@ -48,7 +47,7 @@ public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier for (String w : words) { Map<String, Integer> wordCountsForClass = nm.get(klass); if (wordCountsForClass == null) { - wordCountsForClass = new HashMap<String, Integer>(); + wordCountsForClass = new HashMap<>(); } Integer count = wordCountsForClass.get(w); if (count == null) { @@ -69,7 +68,7 @@ public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier private void calculateDen(String c) { // den : for the whole dictionary, count the no of times a word appears in documents of class c (+|V|) - Double den = 0d; + double den = 0d; for (String w : vocabulary) { Integer integer = nm.get(c).get(w); den += integer != null ? integer : 0; @@ -78,8 +77,9 @@ public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier dens.put(c, den); } - public String calculateClass(List<String> words) throws Exception { - Double max = -1000000d; + @Override + public String calculateClass(List<String> words) { + double max = -1000000d; String foundClass = null; for (String cl : nm.keySet()) { double prior = priors.get(cl); @@ -100,7 +100,7 @@ public class UpdatableSimpleNaiveBayesClassifier implements NaiveBayesClassifier double result = 0d; for (String word : words) { // num : count the no of times the word appears in documents of class c (+1) - Integer freq = wordFreqs.get(word) != null ? wordFreqs.get(word) : 0; + int freq = wordFreqs.get(word) != null ? wordFreqs.get(word) : 0; double num = freq + 1d; // +1 is added because of add 1 smoothing // P(w|c) = num/den diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java index e84f48c..7b56847 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/LanguageModel.java @@ -22,18 +22,18 @@ import java.util.Collection; /** * A language model can calculate the probability <i>p</i> (between 0 and 1) of a - * certain set of <code>T</code> objects, given a vocabulary. - * <code>T</code> is usually an {@link java.lang.Iterable} or an array as language models are very commonly used for - * sentences, so that T is e.g. an array of <code>String</code>s. + * certain set of {@code T} objects, given a vocabulary. + * {@code T} is usually an {@link java.lang.Iterable} or an array as language models are very commonly used for + * sentences, so that T is e.g. an array of {@code String}s. */ public interface LanguageModel<T> { /** - * Calculate the probability of a sample, given a vocabulary + * Calculates the probability of a sample, given a {@code vocabulary}, * - * @param vocabulary a {@link Collection} of objects of type <code>T</code> + * @param vocabulary a {@link Collection} of objects of type {@code T} * @param sample the sample to evaluate the probability for - * @return a <code>double</code> between <code>0</code> and <code>1</code> + * @return a {@code double} between {@code 0} and {@code 1} */ double calculateProbability(Collection<T> vocabulary, T sample); diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java index abadc23..d29b2e5 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModel.java @@ -22,8 +22,8 @@ import java.util.Collection; import java.util.Collections; /** - * Simple sentence language model which just counts the no. of occurrences of - * a sentence over the no. of sentences in the vocabulary. + * Simple sentence language model which just counts the occurrences of + * a sentence over the number of sentences in the vocabulary. */ public class NaiveSentenceLanguageModel<T> implements LanguageModel<T[]> { diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java index fd2b81d..9085605 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/NoisyChannel.java @@ -30,10 +30,10 @@ public abstract class NoisyChannel { } public String findCorrection(String mispelledWord) { - Double val = 0d; + double val = 0d; String correctWord = null; for (String word : dictionary) { - Double curVal = calculateLikelihood(mispelledWord, word) * calculatePrior(word); + double curVal = calculateLikelihood(mispelledWord, word) * calculatePrior(word); if (curVal > val) { val = curVal; correctWord = word; diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java index a916cd3..b51856e 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModel.java @@ -54,7 +54,7 @@ public class TrigramSentenceLanguageModel<T> implements LanguageModel<T[]> { } private Set<Trigram> getTrigrams(T[] sample) { - Set<Trigram> trigrams = new HashSet<Trigram>(); + Set<Trigram> trigrams = new HashSet<>(); for (int i = 0; i < sample.length; i++) { T x0 = null; T x1 = null; diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java index a7371bc..00eccb6 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/ngram/NGramUtils.java @@ -23,7 +23,7 @@ import java.util.Collection; import java.util.HashSet; /** - * utility class for calculating probabilities of tri/bi/uni-grams + * A utility class for calculating probabilities of tri/bi/uni-grams. */ public class NGramUtils { @@ -123,8 +123,8 @@ public class NGramUtils { } public static <T> Double calculateMissingBigramProbabilityMass(T x1, Double discount, Collection<T[]> set) { - Double missingMass = 0d; - Double countWord = count(x1, set); + double missingMass = 0d; + double countWord = count(x1, set); for (T word : flatSet(set)) { missingMass += (count(word, x1, set) - discount) / countWord; } diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java index 009e72b..67613b5 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/GradientDescentUtils.java @@ -23,7 +23,7 @@ import java.util.Random; import org.apache.opennlp.utils.TrainingSet; /** - * Utility class for calculating gradient descent + * A utility class for calculating gradient descent. */ public class GradientDescentUtils { diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java index 4642fbd..711ae43 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/Hypothesis.java @@ -24,7 +24,7 @@ package org.apache.opennlp.utils.regression; public interface Hypothesis { /** - * calculate the output given some inputs according to the underlying model. + * Calculates the output given some inputs according to the underlying model. * * @param inputs an array of inputs as <code>double</code> * @return a <code>double</code> representing the output diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java index 45efba8..6150a37 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/LinearCombinationHypothesis.java @@ -19,7 +19,7 @@ package org.apache.opennlp.utils.regression; /** - * Simplest {@link Hypothesis} which just linearly combines inputs with weights + * Simplest {@link Hypothesis} which just linearly combines inputs with weights. */ public class LinearCombinationHypothesis implements Hypothesis { private final double[] weights; diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java index d543f51..7b606a2 100644 --- a/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java +++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/regression/RegressionModelUtils.java @@ -22,16 +22,16 @@ import org.apache.opennlp.utils.TrainingExample; import org.apache.opennlp.utils.TrainingSet; /** - * Utility class for calculating various regression models costs + * A utility class for calculating various regression models costs. */ public class RegressionModelUtils { /** - * calculate the ordinary least squares (OLS) cost in the given training set for a given hypothesis + * Calculates the ordinary least squares (OLS) cost in the given training set for a given hypothesis. * - * @param trainingSet the training set used - * @param hypothesis the hypothesis function representing the model - * @return the cost of the hypothesis for the given training set using OLS + * @param trainingSet The {@link TrainingSet} used. + * @param hypothesis The {@link Hypothesis} function representing the model. + * @return The cost of the hypothesis for the given training set using OLS. */ public static double ordinaryLeastSquares(TrainingSet trainingSet, Hypothesis hypothesis) { double output = 0; @@ -43,12 +43,12 @@ public class RegressionModelUtils { } /** - * calculate the least mean square (LMS) update for a given weight vector + * Calculates the least mean square (LMS) update for a given weight vector. * - * @param thetas the array of weights - * @param alpha the learning rate alpha - * @param trainingSet the training set to use for learning - * @param hypothesis the hypothesis representing the model + * @param thetas The array of weights. + * @param alpha The learning rate alpha. + * @param trainingSet The {@link TrainingSet} to use for learning. + * @param hypothesis The {@link Hypothesis} representing the model. * @return the updated weights vector */ public static double[] batchLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingSet trainingSet, Hypothesis hypothesis) { @@ -66,12 +66,12 @@ public class RegressionModelUtils { /** * calculate least mean square update for a given training example for the j-th input * - * @param thetas the array of weights - * @param alpha the learning rate alpha - * @param trainingExample the training example to use for learning - * @param hypothesis the hypothesis representing the model - * @param j the index of the j-th input - * @return the updated weight for the j-th element of the weights vector + * @param thetas The array of weights. + * @param alpha The learning rate alpha. + * @param trainingExample The {@link TrainingExample} to use for learning. + * @param hypothesis The {@link Hypothesis} representing the model. + * @param j The index of the j-th input. + * @return The updated weight for the j-th element of the weights vector. */ public static double singleLeastMeanSquareUpdate(double[] thetas, double alpha, TrainingExample trainingExample, Hypothesis hypothesis, int j) { return thetas[j] + alpha * (trainingExample.getOutput() - hypothesis.calculateOutput(trainingExample.getInputs())) * trainingExample.getInputs()[j]; diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java index 5d4b84f..32d318f 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/TestUtils.java @@ -29,7 +29,7 @@ import org.junit.Ignore; @Ignore public class TestUtils { - private static Random r = new Random(); + private static final Random R = new Random(); public static void fillTrainingSet(TrainingSet trainingSet, int size, int dimension) { for (int i = 0; i < size; i++) { @@ -43,7 +43,7 @@ public class TestUtils { } public static Collection<String[]> generateRandomVocabulary() { - int size = r.nextInt(1000); + int size = R.nextInt(1000); Collection<String[]> vocabulary = new ArrayList<String[]>(size); for (int i = 0; i < size; i++) { String[] sentence = generateRandomSentence(); @@ -53,10 +53,10 @@ public class TestUtils { } public static String[] generateRandomSentence() { - int dimension = r.nextInt(10); + int dimension = R.nextInt(10); String[] sentence = new String[dimension]; for (int j = 0; j < dimension; j++) { - char c = (char) r.nextInt(10); + char c = (char) R.nextInt(10); sentence[j] = c + "-" + c + "-" + c; } return sentence; diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java index 7a84c2f..d5f2560 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/anomalydetection/AnomalyDetectionUtilsTest.java @@ -25,7 +25,6 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; /** * Testcase for {@link org.apache.opennlp.utils.anomalydetection.AnomalyDetectionUtils} @@ -33,7 +32,7 @@ import static org.junit.Assert.assertTrue; public class AnomalyDetectionUtilsTest { @Test - public void testGaussianDistributionProbabilityFromFitParameters() throws Exception { + public void testGaussianDistributionProbabilityFromFitParameters() { TrainingSet trainingSet = new TrainingSet(); TestUtils.fillTrainingSet(trainingSet, 100, 5); double[] mus = AnomalyDetectionUtils.fitMus(trainingSet); @@ -46,7 +45,7 @@ public class AnomalyDetectionUtilsTest { } @Test - public void testGaussianDistributionProbabilityFromTrainingSet() throws Exception { + public void testGaussianDistributionProbabilityFromTrainingSet() { TrainingSet trainingSet = new TrainingSet(); TestUtils.fillTrainingSet(trainingSet, 100, 5); TrainingExample newInput = new TrainingExample(new double[]{0.4d,0.5d,0.5d,0.5d,0.2d}, 0d); diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java index 8016679..8d7f408 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/classification/SimpleNaiveBayesClassifierTest.java @@ -31,8 +31,8 @@ import static org.junit.Assert.assertTrue; public class SimpleNaiveBayesClassifierTest { @Test - public void ppsIntegrationTest() throws Exception { - Map<String, String> trainedCorpus = new HashMap<String, String>(); + public void ppsIntegrationTest() { + Map<String, String> trainedCorpus = new HashMap<>(); trainedCorpus.put("CAVOUR ad.te napoleone III affare: cat. C/2 ottimo" + " stato ingresso angolo cottura bagno con doccia e camera. " + "ottimo per investimento o piccolo studio per professionisti" + @@ -69,7 +69,7 @@ public class SimpleNaiveBayesClassifierTest { SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier(trainedCorpus); - Boolean isAgency = classifier.calculateClass("CENTRO S.Maria Maggiore " + + boolean isAgency = classifier.calculateClass("CENTRO S.Maria Maggiore " + "angolo Napoleone III in palazzo epoca con portiere 110 mq ristrutt." + " IIp salone doppio cucina ab. 2 camere bagno ripost. balcone " + "perimetrale E. 730.000 tratt. ").equals("A"); diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java index 10e0fac..c4d0825 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/NaiveSentenceLanguageModelTest.java @@ -31,17 +31,17 @@ import static org.junit.Assert.assertTrue; public class NaiveSentenceLanguageModelTest { @Test - public void testEmptyVocabularyProbability() throws Exception { - NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>(); - assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(), - new String[0]), 0d); - assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(), - new String[]{"1", "2", "3"}), 0d); + public void testEmptyVocabularyProbability() { + NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<>(); + assertEquals("probability with an empty vocabulary is always 0", + 0d, model.calculateProbability(Collections.emptySet(), new String[0]), 0d); + assertEquals("probability with an empty vocabulary is always 0", + 0d, model.calculateProbability(Collections.emptySet(), new String[]{"1", "2", "3"}), 0d); } @Test - public void testRandomVocabularyAndSentence() throws Exception { - NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<String>(); + public void testRandomVocabularyAndSentence() { + NaiveSentenceLanguageModel<String> model = new NaiveSentenceLanguageModel<>(); double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence()); assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1); } diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java index b2d6d51..b716c26 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/languagemodel/TrigramSentenceLanguageModelTest.java @@ -31,17 +31,17 @@ import static org.junit.Assert.assertTrue; public class TrigramSentenceLanguageModelTest { @Test - public void testEmptyVocabularyProbability() throws Exception { - TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>(); - assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(), - new String[0]), 0d); - assertEquals("probability with an empty vocabulary is always 0", 0d, model.calculateProbability(Collections.<String[]>emptySet(), - new String[]{"1", "2", "3"}), 0d); + public void testEmptyVocabularyProbability() { + TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<>(); + assertEquals("probability with an empty vocabulary is always 0", + 0d, model.calculateProbability(Collections.emptySet(), new String[0]), 0d); + assertEquals("probability with an empty vocabulary is always 0", + 0d, model.calculateProbability(Collections.emptySet(), new String[]{"1", "2", "3"}), 0d); } @Test - public void testRandomVocabularyAndSentence() throws Exception { - TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<String>(); + public void testRandomVocabularyAndSentence() { + TrigramSentenceLanguageModel<String> model = new TrigramSentenceLanguageModel<>(); double probability = model.calculateProbability(TestUtils.generateRandomVocabulary(), TestUtils.generateRandomSentence()); assertTrue("a probability measure should be between 0 and 1 [was " + probability + "]", probability >= 0 && probability <= 1); } diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java index 8da4947..a988e98 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/ngram/NGramUtilsTest.java @@ -32,7 +32,7 @@ import static org.junit.Assert.assertTrue; public class NGramUtilsTest { @Test public void testBigram() { - Collection<String[]> set = new LinkedList<String[]>(); + Collection<String[]> set = new LinkedList<>(); set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"}); set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"}); set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"}); @@ -48,7 +48,7 @@ public class NGramUtilsTest { @Test public void testTrigram() { - Collection<String[]> set = new LinkedList<String[]>(); + Collection<String[]> set = new LinkedList<>(); set.add(new String[]{"<s>", "I", "am", "Sam", "</s>"}); set.add(new String[]{"<s>", "Sam", "I", "am", "</s>"}); set.add(new String[]{"<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"}); @@ -60,8 +60,8 @@ public class NGramUtilsTest { } @Test - public void testLinearInterpolation() throws Exception { - Collection<String[]> set = new LinkedList<String[]>(); + public void testLinearInterpolation() { + Collection<String[]> set = new LinkedList<>(); set.add(new String[]{"the", "green", "book", "STOP"}); set.add(new String[]{"my", "blue", "book", "STOP"}); set.add(new String[]{"his", "green", "house", "STOP"}); @@ -74,8 +74,8 @@ public class NGramUtilsTest { } @Test - public void testLinearInterpolation2() throws Exception { - Collection<String[]> set = new LinkedList<String[]>(); + public void testLinearInterpolation2() { + Collection<String[]> set = new LinkedList<>(); set.add(new String[]{"D", "N", "V", "STOP"}); set.add(new String[]{"D", "N", "V", "STOP"}); Double lambda = 1d / 3d; diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java index 91e7370..4486120 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/GradientDescentUtilsTest.java @@ -28,7 +28,7 @@ import org.junit.Test; public class GradientDescentUtilsTest { @Test - public void testConvergence() throws Exception { + public void testConvergence() { TrainingSet trainingSet = new TrainingSet(); TestUtils.fillTrainingSet(trainingSet, 100, 5); GradientDescentUtils.batchGradientDescent(trainingSet, 0.00002); diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java index 4c46a9b..8676989 100644 --- a/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java +++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/regression/RegressionModelUtilsTest.java @@ -22,8 +22,8 @@ import org.apache.opennlp.utils.TrainingExample; import org.apache.opennlp.utils.TrainingSet; import org.junit.Test; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; /** * Testcase for {@link org.apache.opennlp.utils.regression.RegressionModelUtils} @@ -31,14 +31,15 @@ import static org.junit.Assert.assertTrue; public class RegressionModelUtilsTest { @Test - public void testLMS() throws Exception { + public void testLMS() { TrainingSet trainingSet = new TrainingSet(); trainingSet.add(new TrainingExample(new double[]{10, 10}, 1)); LinearCombinationHypothesis hypothesis = new LinearCombinationHypothesis(1, 1); - double[] updatedParameters = RegressionModelUtils.batchLeastMeanSquareUpdate(new double[]{1, 1}, 0.1, trainingSet, hypothesis); + double[] updatedParameters = RegressionModelUtils.batchLeastMeanSquareUpdate( + new double[]{1, 1}, 0.1, trainingSet, hypothesis); assertNotNull(updatedParameters); - assertTrue(updatedParameters.length == 2); - assertTrue(updatedParameters[0] == -18d); - assertTrue(updatedParameters[1] == -18d); + assertEquals(2, updatedParameters.length); + assertEquals(-18d, updatedParameters[0], 0.0); + assertEquals(-18d, updatedParameters[1], 0.0); } }
