svn commit: r1729193 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/languagemodel/ main/java/opennlp/tools/ngram/ test/java/opennlp/tools/languagemodel/ test/java/opennlp/tools/ngram/ test/resources/opennlp/tools/languagemodel/

tommaso Mon, 08 Feb 2016 08:11:11 -0800

Author: tommaso
Date: Mon Feb  8 15:52:08 2016
New Revision: 1729193

URL: http://svn.apache.org/viewvc?rev=1729193&view=rev
Log:
OPENNLP-659 - added support for language models


Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java   
(with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
   (with props)
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java
   (with props)
    opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/
    
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
   (with props)
Modified:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.languagemodel;
+
+import opennlp.tools.util.StringList;
+
+/**
+ * A language model can calculate the probability <i>p</i> (between 0 and 1) 
of a
+ * certain {@link opennlp.tools.util.StringList sequence of tokens}, given its 
underlying vocabulary.
+ */
+public interface LanguageModel {
+
+  /**
+   * Calculate the probability of a series of tokens (e.g. a sentence), given 
a vocabulary
+   *
+   * @param tokens the text tokens to calculate the probability for
+   * @return the probability of the given text tokens in the vocabulary
+   */
+  double calculateProbability(StringList tokens);
+
+  /**
+   * Predict the most probable output sequence of tokens, given an input 
sequence of tokens
+   *
+   * @param tokens a sequence of tokens
+   * @return the most probable subsequent token sequence
+   */
+  StringList predictNextTokens(StringList tokens);
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.languagemodel;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.ngram.NGramUtils;
+import opennlp.tools.util.StringList;
+
+/**
+ * AÂ {@link opennlp.tools.languagemodel.LanguageModel} based on a {@link 
opennlp.tools.ngram.NGramModel} using Laplace
+ * smoothing probability estimation to get the probabilities of the ngrams.
+ * See also {@link 
NGramUtils#calculateLaplaceSmoothingProbability(opennlp.tools.util.StringList, 
Iterable, int, Double)}.
+ */
+public class NGramLanguageModel extends NGramModel implements LanguageModel {
+
+  private static final int DEFAULT_N = 3;
+  private static final double DEFAULT_K = 1d;
+
+  private final int n;
+  private final double k;
+
+  public NGramLanguageModel() {
+    this(DEFAULT_N, DEFAULT_K);
+  }
+
+  public NGramLanguageModel(int n) {
+    this(n, DEFAULT_K);
+  }
+
+  public NGramLanguageModel(double k) {
+    this(DEFAULT_N, k);
+  }
+
+  public NGramLanguageModel(int n, double k) {
+    this.n = n;
+    this.k = k;
+  }
+
+  public NGramLanguageModel(InputStream in) throws IOException {
+    this(in, DEFAULT_N, DEFAULT_K);
+  }
+
+  public NGramLanguageModel(InputStream in, double k) throws IOException {
+    this(in, DEFAULT_N, k);
+  }
+
+  public NGramLanguageModel(InputStream in, int n) throws IOException {
+    this(in, n, DEFAULT_K);
+  }
+
+  public NGramLanguageModel(InputStream in, int n, double k) throws 
IOException {
+    super(in);
+    this.n = n;
+    this.k = k;
+  }
+
+  @Override
+  public double calculateProbability(StringList sample) {
+    double probability = 0d;
+    if (size() > 0) {
+      for (StringList ngram : NGramUtils.getNGrams(sample, n)) {
+        StringList nMinusOneToken = NGramUtils.getNMinusOneTokenFirst(ngram);
+        if (size() > 1000000) {
+          // use stupid backoff
+          probability += Math.log(getStupidBackoffProbability(ngram, 
nMinusOneToken));
+        } else {
+          // use laplace smoothing
+          probability += Math.log(getLaplaceSmoothingProbability(ngram, 
nMinusOneToken));
+        }
+      }
+      if (Double.isNaN(probability)) {
+        probability = 0d;
+      } else if (probability != 0) {
+        probability = Math.exp(probability);
+      }
+
+    }
+    return probability;
+  }
+
+  @Override
+  public StringList predictNextTokens(StringList tokens) {
+    double maxProb = Double.NEGATIVE_INFINITY;
+    StringList token = null;
+
+    for (StringList ngram : this) {
+      String[] sequence = new String[ngram.size() + tokens.size()];
+      for (int i = 0; i < tokens.size(); i++) {
+        sequence[i] = tokens.getToken(i);
+      }
+      for (int i = 0; i < ngram.size(); i++) {
+        sequence[i + tokens.size()] = ngram.getToken(i);
+      }
+      StringList sample = new StringList(sequence);
+      double v = calculateProbability(sample);
+      if (v > maxProb) {
+        maxProb = v;
+        token = ngram;
+      }
+    }
+
+    return token;
+  }
+
+  private double getLaplaceSmoothingProbability(StringList ngram, StringList 
nMinusOneToken) {
+    return (getCount(ngram) + k) / ((double) getCount(nMinusOneToken) + k * 
size());
+  }
+
+  private double getStupidBackoffProbability(StringList ngram, StringList 
nMinusOneToken) {
+    int count = getCount(ngram);
+    if (nMinusOneToken == null || nMinusOneToken.size() == 0) {
+      return count / size();
+    } else if (count > 0) {
+      return ((double) count) / ((double) getCount(nMinusOneToken)); // 
maximum likelihood probability
+    } else {
+      StringList nextNgram = NGramUtils.getNMinusOneTokenLast(ngram);
+      return 0.4d * getStupidBackoffProbability(nextNgram, 
NGramUtils.getNMinusOneTokenFirst(nextNgram));
+    }
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java?rev=1729193&r1=1729192&r2=1729193&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java 
(original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java 
Mon Feb  8 15:52:08 2016
@@ -55,7 +55,7 @@ public class NGramModel implements Itera
   /**
    * Initializes the current instance.
    *
-   * @param in
+   * @param in the serialized model stream
    * @throws IOException
    * @throws InvalidFormatException
    */
@@ -89,8 +89,7 @@ public class NGramModel implements Itera
   /**
    * Retrieves the count of the given ngram.
    *
-   * @param ngram
-   *
+   * @param ngram an ngram
    * @return count of the ngram or 0 if it is not contained
    *
    */
@@ -129,8 +128,7 @@ public class NGramModel implements Itera
   public void add(StringList ngram) {
     if (contains(ngram)) {
       setCount(ngram, getCount(ngram) + 1);
-    }
-    else {
+    } else {
       mNGrams.put(ngram, 1);
     }
   }
@@ -153,8 +151,7 @@ public class NGramModel implements Itera
         throw new IllegalArgumentException("minLength param must not be larger 
than " +
             "maxLength param. minLength=" + minLength + ", maxLength= " + 
maxLength);
 
-    for (int lengthIndex = minLength; lengthIndex < maxLength + 1;
-    lengthIndex++) {
+    for (int lengthIndex = minLength; lengthIndex < maxLength + 1; 
lengthIndex++) {
       for (int textIndex = 0;
           textIndex + lengthIndex - 1 < ngram.size(); textIndex++) {
 
@@ -178,8 +175,7 @@ public class NGramModel implements Itera
    */
   public void add(String chars, int minLength, int maxLength) {
 
-    for (int lengthIndex = minLength; lengthIndex < maxLength + 1;
-    lengthIndex++) {
+    for (int lengthIndex = minLength; lengthIndex < maxLength + 1; 
lengthIndex++) {
       for (int textIndex = 0;
           textIndex + lengthIndex - 1 < chars.length(); textIndex++) {
 
@@ -255,7 +251,7 @@ public class NGramModel implements Itera
 
     if (cutoffUnder > 0 || cutoffOver < Integer.MAX_VALUE) {
 
-      for (Iterator<StringList> it = iterator(); it.hasNext();) {
+      for (Iterator<StringList> it = iterator(); it.hasNext(); ) {
 
         StringList ngram = it.next();
 

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java 
(added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java 
Mon Feb  8 15:52:08 2016
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.ngram;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedList;
+
+import opennlp.tools.util.StringList;
+
+/**
+ * Utility class for ngrams.
+ * Some methods apply specifically to certain 'n' values, for e.g. 
tri/bi/uni-grams.
+ */
+public class NGramUtils {
+
+  /**
+   * calculate the probability of a ngram in a vocabulary using Laplace 
smoothing algorithm
+   *
+   * @param ngram the ngram to get the probability for
+   * @param set   the vocabulary
+   * @param size  the size of the vocabulary
+   * @param k     the smoothing factor
+   * @return the Laplace smoothing probability
+   * @see <a href="https://en.wikipedia.org/wiki/Additive_smoothing";>Additive 
Smoothing</a>
+   */
+  public static double calculateLaplaceSmoothingProbability(StringList ngram, 
Iterable<StringList> set, int size, Double k) {
+    return (count(ngram, set) + k) / (count(getNMinusOneTokenFirst(ngram), 
set) + k * 1);
+  }
+
+  /**
+   * calculate the probability of a unigram in a vocabulary using maximum 
likelihood estimation
+   *
+   * @param word the only word in the unigram
+   * @param set  the vocabulary
+   * @return the maximum likelihood probability
+   */
+  public static double calculateUnigramMLProbability(String word, 
Collection<StringList> set) {
+    double vocSize = 0d;
+    for (StringList s : set) {
+      vocSize += s.size();
+    }
+    return count(new StringList(word), set) / vocSize;
+  }
+
+  /**
+   * calculate the probability of a bigram in a vocabulary using maximum 
likelihood estimation
+   *
+   * @param x0  first word in the bigram
+   * @param x1  second word in the bigram
+   * @param set the vocabulary
+   * @return the maximum likelihood probability
+   */
+  public static double calculateBigramMLProbability(String x0, String x1, 
Collection<StringList> set) {
+    return calculateNgramMLProbability(new StringList(x0, x1), set);
+  }
+
+  /**
+   * calculate the probability of a trigram in a vocabulary using maximum 
likelihood estimation
+   *
+   * @param x0  first word in the trigram
+   * @param x1  second word in the trigram
+   * @param x2  third word in the trigram
+   * @param set the vocabulary
+   * @return the maximum likelihood probability
+   */
+  public static double calculateTrigramMLProbability(String x0, String x1, 
String x2, Iterable<StringList> set) {
+    return calculateNgramMLProbability(new StringList(x0, x1, x2), set);
+  }
+
+  /**
+   * calculate the probability of a ngram in a vocabulary using maximum 
likelihood estimation
+   *
+   * @param ngram a ngram
+   * @param set   the vocabulary
+   * @return the maximum likelihood probability
+   */
+  public static double calculateNgramMLProbability(StringList ngram, 
Iterable<StringList> set) {
+    StringList ngramMinusOne = getNMinusOneTokenFirst(ngram);
+    return count(ngram, set) / count(ngramMinusOne, set);
+  }
+
+  /**
+   * calculate the probability of a bigram in a vocabulary using prior Laplace 
smoothing algorithm
+   *
+   * @param x0  the first word in the bigram
+   * @param x1  the second word in the bigram
+   * @param set the vocabulary
+   * @param k   the smoothing factor
+   * @return the prior Laplace smoothiing probability
+   */
+  public static double calculateBigramPriorSmoothingProbability(String x0, 
String x1, Collection<StringList> set, Double k) {
+    return (count(new StringList(x0, x1), set) + k * 
calculateUnigramMLProbability(x1, set)) /
+        (count(new StringList(x0), set) + k * set.size());
+  }
+
+  /**
+   * calculate the probability of a trigram in a vocabulary using a linear 
interpolation algorithm
+   *
+   * @param x0      the first word in the trigram
+   * @param x1      the second word in the trigram
+   * @param x2      the third word in the trigram
+   * @param set     the vocabulary
+   * @param lambda1 trigram interpolation factor
+   * @param lambda2 bigram interpolation factor
+   * @param lambda3 unigram interpolation factor
+   * @return the linear interpolation probability
+   */
+  public static double calculateTrigramLinearInterpolationProbability(String 
x0, String x1, String x2, Collection<StringList> set,
+                                                                      Double 
lambda1, Double lambda2, Double lambda3) {
+    assert lambda1 + lambda2 + lambda3 == 1 : "lambdas sum should be equals to 
1";
+    assert lambda1 > 0 && lambda2 > 0 && lambda3 > 0 : "lambdas should all be 
greater than 0";
+
+    return lambda1 * calculateTrigramMLProbability(x0, x1, x2, set) +
+        lambda2 * calculateBigramMLProbability(x1, x2, set) +
+        lambda3 * calculateUnigramMLProbability(x2, set);
+
+  }
+
+  /**
+   * calculate the probability of a ngram in a vocabulary using the missing 
probability mass algorithm
+   *
+   * @param ngram    the ngram
+   * @param discount discount factor
+   * @param set      the vocabulary
+   * @return the probability
+   */
+  public static double calculateMissingNgramProbabilityMass(StringList ngram, 
Double discount, Iterable<StringList> set) {
+    Double missingMass = 0d;
+    Double countWord = count(ngram, set);
+    for (String word : flatSet(set)) {
+      missingMass += (count(getNPlusOneNgram(ngram, word), set) - discount) / 
countWord;
+    }
+    return 1 - missingMass;
+  }
+
+  /**
+   * get the (n-1)th ngram of a given ngram, that is the same ngram except the 
last word in the ngram
+   *
+   * @param ngram a ngram
+   * @return a ngram
+   */
+  public static StringList getNMinusOneTokenFirst(StringList ngram) {
+    String[] tokens = new String[ngram.size() - 1];
+    for (int i = 0; i < ngram.size() - 1; i++) {
+      tokens[i] = ngram.getToken(i);
+    }
+    return tokens.length > 0 ? new StringList(tokens) : null;
+  }
+
+  /**
+   * get the (n-1)th ngram of a given ngram, that is the same ngram except the 
first word in the ngram
+   *
+   * @param ngram a ngram
+   * @return a ngram
+   */
+  public static StringList getNMinusOneTokenLast(StringList ngram) {
+    String[] tokens = new String[ngram.size() - 1];
+    for (int i = 1; i < ngram.size(); i++) {
+      tokens[i - 1] = ngram.getToken(i);
+    }
+    return tokens.length > 0 ? new StringList(tokens) : null;
+  }
+
+  private static StringList getNPlusOneNgram(StringList ngram, String word) {
+    String[] tokens = new String[ngram.size() + 1];
+    for (int i = 0; i < ngram.size(); i++) {
+      tokens[i] = ngram.getToken(i);
+    }
+    tokens[tokens.length - 1] = word;
+    return new StringList(tokens);
+  }
+
+  private static Double count(StringList ngram, Iterable<StringList> 
sentences) {
+    Double count = 0d;
+    for (StringList sentence : sentences) {
+      int idx0 = indexOf(sentence, ngram.getToken(0));
+      if (idx0 >= 0 && sentence.size() >= idx0 + ngram.size()) {
+        boolean match = true;
+        for (int i = 1; i < ngram.size(); i++) {
+          String sentenceToken = sentence.getToken(idx0 + i);
+          String ngramToken = ngram.getToken(i);
+          match &= sentenceToken.equals(ngramToken);
+        }
+        if (match) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+
+  private static int indexOf(StringList sentence, String token) {
+    for (int i = 0; i < sentence.size(); i++) {
+      if (token.equals(sentence.getToken(i))) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  private static Collection<String> flatSet(Iterable<StringList> set) {
+    Collection<String> flatSet = new HashSet<>();
+    for (StringList sentence : set) {
+      for (String word : sentence) {
+        flatSet.add(word);
+      }
+    }
+    return flatSet;
+  }
+
+  /**
+   * get the ngrams of dimension n of a certain input sequence of tokens
+   *
+   * @param sequence a sequence of tokens
+   * @param size     the size of the resulting ngrmams
+   * @return all the possible ngrams of the given size derivable from the 
input sequence
+   */
+  public static Collection<StringList> getNGrams(StringList sequence, int 
size) {
+    Collection<StringList> ngrams = new LinkedList<>();
+    if (size == -1 || size >= sequence.size()) {
+      ngrams.add(sequence);
+    } else {
+      String[] ngram = new String[size];
+      for (int i = 0; i < sequence.size() - size + 1; i++) {
+        ngram[0] = sequence.getToken(i);
+        for (int j = 1; j < size; j++) {
+          ngram[j] = sequence.getToken(i + j);
+        }
+        ngrams.add(new StringList(ngram));
+      }
+    }
+
+    return ngrams;
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.languagemodel;
+
+import java.util.Collection;
+
+import opennlp.tools.util.StringList;
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for evaluating accuracy of language models
+ */
+public class LanguageModelEvaluationTest {
+
+  @Test
+  public void testPerplexityComparison() throws Exception {
+
+    Collection<StringList> trainingVocabulary = 
LanguageModelTestUtils.generateRandomVocabulary(1100000);
+    Collection<StringList> testVocabulary = 
LanguageModelTestUtils.generateRandomVocabulary(100);
+
+    NGramLanguageModel unigramLM = new NGramLanguageModel(1);
+    for (StringList sentence : trainingVocabulary) {
+      unigramLM.add(sentence, 1, 1);
+    }
+    double unigramPerplexity = LanguageModelTestUtils.getPerplexity(unigramLM, 
testVocabulary, 1);
+
+    NGramLanguageModel bigramLM = new NGramLanguageModel(2);
+    for (StringList sentence : trainingVocabulary) {
+      bigramLM.add(sentence, 1, 2);
+    }
+    double bigramPerplexity = LanguageModelTestUtils.getPerplexity(bigramLM, 
testVocabulary, 2);
+    assertTrue(unigramPerplexity >= bigramPerplexity);
+
+    NGramLanguageModel trigramLM = new NGramLanguageModel(3);
+    for (StringList sentence : trainingVocabulary) {
+      trigramLM.add(sentence, 2, 3);
+    }
+    double trigramPerplexity = LanguageModelTestUtils.getPerplexity(trigramLM, 
testVocabulary, 3);
+    assertTrue(bigramPerplexity >= trigramPerplexity);
+
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelEvaluationTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.languagemodel;
+
+import java.math.BigDecimal;
+import java.math.MathContext;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.Random;
+
+import opennlp.tools.ngram.NGramUtils;
+import opennlp.tools.util.StringList;
+import org.junit.Ignore;
+
+/**
+ * Utility class for language models tests
+ */
+@Ignore
+public class LanguageModelTestUtils {
+
+  private static final java.math.MathContext CONTEXT = MathContext.DECIMAL128;
+  private static Random r = new Random();
+
+  private static final char[] chars = new char[]{'a', 'b', 'c', 'd', 'e', 'f', 
'g', 'h', 'i', 'j'};
+
+  public static Collection<StringList> generateRandomVocabulary(int size) {
+    Collection<StringList> vocabulary = new LinkedList<>();
+    for (int i = 0; i < size; i++) {
+      StringList sentence = generateRandomSentence();
+      vocabulary.add(sentence);
+    }
+    return vocabulary;
+  }
+
+  public static StringList generateRandomSentence() {
+    int dimension = r.nextInt(10) + 1;
+    String[] sentence = new String[dimension];
+    for (int j = 0; j < dimension; j++) {
+      int i = r.nextInt(10);
+      char c = chars[i];
+      sentence[j] = c + "-" + c + "-" + c;
+    }
+    return new StringList(sentence);
+  }
+
+  public static double getPerplexity(LanguageModel lm, Collection<StringList> 
testSet, int ngramSize) throws ArithmeticException {
+    BigDecimal perplexity = new BigDecimal(1d);
+
+    for (StringList sentence : testSet) {
+      for (StringList ngram : NGramUtils.getNGrams(sentence, ngramSize)) {
+        double ngramProbability = lm.calculateProbability(ngram);
+        perplexity = perplexity.multiply(new BigDecimal(1d).divide(new 
BigDecimal(ngramProbability), CONTEXT));
+      }
+    }
+
+    double p = Math.log(perplexity.doubleValue());
+    if (Double.isInfinite(p) || Double.isNaN(p)) {
+      return Double.POSITIVE_INFINITY; // over/underflow -> too high perplexity
+    } else {
+      BigDecimal log = new BigDecimal(p);
+      return Math.pow(Math.E, log.divide(new BigDecimal(testSet.size()), 
CONTEXT).doubleValue());
+    }
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/LanguageModelTestUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.languagemodel;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramGenerator;
+import opennlp.tools.util.StringList;
+import org.apache.commons.io.IOUtils;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@link opennlp.tools.languagemodel.NGramLanguageModel}
+ */
+public class NgramLanguageModelTest {
+
+  @Test
+  public void testEmptyVocabularyProbability() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel();
+    assertEquals("probability with an empty vocabulary is always 0", 0d, 
model.calculateProbability(new StringList("")), 0d);
+    assertEquals("probability with an empty vocabulary is always 0", 0d, 
model.calculateProbability(new StringList("1", "2", "3")), 0d);
+  }
+
+  @Test
+  public void testRandomVocabularyAndSentence() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel();
+    for (StringList sentence : 
LanguageModelTestUtils.generateRandomVocabulary(10)) {
+      model.add(sentence, 2, 3);
+    }
+    double probability = 
model.calculateProbability(LanguageModelTestUtils.generateRandomSentence());
+    assertTrue("a probability measure should be between 0 and 1 [was " + 
probability + "]", probability >= 0 && probability <= 1);
+  }
+
+  @Test
+  public void testNgramModel() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel(4);
+    model.add(new StringList("I", "saw", "the", "fox"), 1, 4);
+    model.add(new StringList("the", "red", "house"), 1, 4);
+    model.add(new StringList("I", "saw", "something", "nice"), 1, 2);
+    double probability = model.calculateProbability(new StringList("I", "saw", 
"the", "red", "house"));
+    assertTrue("a probability measure should be between 0 and 1 [was " + 
probability + "]", probability >= 0 && probability <= 1);
+
+    StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+    assertNotNull(tokens);
+    assertEquals(new StringList("the", "fox"), tokens);
+  }
+
+  @Test
+  public void testBigramProbabilityNoSmoothing() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel(2, 0);
+    model.add(new StringList("<s>", "I", "am", "Sam", "</s>"), 1, 2);
+    model.add(new StringList("<s>", "Sam", "I", "am", "</s>"), 1, 2);
+    model.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", 
"and", "ham", "</s>"), 1, 2);
+    double probability = model.calculateProbability(new StringList("<s>", 
"I"));
+    assertEquals(0.666d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("Sam", "</s>"));
+    assertEquals(0.5d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("<s>", "Sam"));
+    assertEquals(0.333d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("am", "Sam"));
+    assertEquals(0.5d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("I", "am"));
+    assertEquals(0.666d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("I", "do"));
+    assertEquals(0.333d, probability, 0.001);
+    probability = model.calculateProbability(new StringList("I", "am", "Sam"));
+    assertEquals(0.333d, probability, 0.001);
+  }
+
+  @Test
+  public void testTrigram() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel(3);
+    model.add(new StringList("I", "see", "the", "fox"), 2, 3);
+    model.add(new StringList("the", "red", "house"), 2, 3);
+    model.add(new StringList("I", "saw", "something", "nice"), 2, 3);
+    double probability = model.calculateProbability(new StringList("I", "saw", 
"the", "red", "house"));
+    assertTrue("a probability measure should be between 0 and 1 [was " + 
probability + "]", probability >= 0 && probability <= 1);
+
+    StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+    assertNotNull(tokens);
+    assertEquals(new StringList("something", "nice"), tokens);
+  }
+
+  @Test
+  public void testBigram() throws Exception {
+    NGramLanguageModel model = new NGramLanguageModel(2);
+    model.add(new StringList("I", "see", "the", "fox"), 1, 2);
+    model.add(new StringList("the", "red", "house"), 1, 2);
+    model.add(new StringList("I", "saw", "something", "nice"), 1, 2);
+    double probability = model.calculateProbability(new StringList("I", "saw", 
"the", "red", "house"));
+    assertTrue("a probability measure should be between 0 and 1 [was " + 
probability + "]", probability >= 0 && probability <= 1);
+
+    StringList tokens = model.predictNextTokens(new StringList("I", "saw"));
+    assertNotNull(tokens);
+    assertEquals(new StringList("something"), tokens);
+  }
+
+  @Test
+  public void testSerializedNGramLanguageModel() throws Exception {
+    NGramLanguageModel languageModel = new 
NGramLanguageModel(getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml"),
 3);
+    double probability = languageModel.calculateProbability(new 
StringList("The", "brown", "fox", "jumped"));
+    assertTrue("a probability measure should be between 0 and 1 [was " + 
probability + "]", probability >= 0 && probability <= 1);
+    StringList tokens = languageModel.predictNextTokens(new StringList("fox"));
+    assertNotNull(tokens);
+    assertEquals(new StringList("jumped"), tokens);
+  }
+
+  @Test
+  public void testTrigramLanguageModelCreationFromText() throws Exception {
+    int ngramSize = 3;
+    NGramLanguageModel languageModel = new NGramLanguageModel(ngramSize);
+    InputStream stream = 
getClass().getResourceAsStream("/opennlp/tools/languagemodel/sentences.txt");
+    for (String line : IOUtils.readLines(stream)) {
+      String[] array = line.split(" ");
+      List<String> split = Arrays.asList(array);
+      List<String> generatedStrings = NGramGenerator.generate(split, 
ngramSize, " ");
+      for (String generatedString : generatedStrings) {
+        String[] tokens = generatedString.split(" ");
+        if (tokens.length > 0) {
+          languageModel.add(new StringList(tokens), 1, ngramSize);
+        }
+      }
+    }
+    StringList tokens = languageModel.predictNextTokens(new 
StringList("neural", "network", "language"));
+    assertNotNull(tokens);
+    assertEquals(new StringList("models"), tokens);
+    double p1 = languageModel.calculateProbability(new StringList("neural", 
"network", "language", "models"));
+    double p2 = languageModel.calculateProbability(new StringList("neural", 
"network", "language", "model"));
+    assertTrue(p1 > p2);
+  }
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/languagemodel/NgramLanguageModelTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package opennlp.tools.ngram;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import opennlp.tools.util.StringList;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@link NGramUtils}
+ */
+public class NGramUtilsTest {
+
+  @Test
+  public void testBigramMLProbability() {
+    Collection<StringList> set = new LinkedList<StringList>();
+    set.add(new StringList("<s>", "I", "am", "Sam", "</s>"));
+    set.add(new StringList("<s>", "Sam", "I", "am", "</s>"));
+    set.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", 
"and", "ham", "</s>"));
+    set.add(new StringList(""));
+    Double d = NGramUtils.calculateBigramMLProbability("<s>", "I", set);
+    assertEquals(Double.valueOf(0.6666666666666666d), d);
+    d = NGramUtils.calculateBigramMLProbability("Sam", "</s>", set);
+    assertEquals(Double.valueOf(0.5d), d);
+    d = NGramUtils.calculateBigramMLProbability("<s>", "Sam", set);
+    assertEquals(Double.valueOf(0.3333333333333333d), d);
+  }
+
+  @Test
+  public void testTrigramMLProbability() {
+    Collection<StringList> set = new LinkedList<StringList>();
+    set.add(new StringList("<s>", "I", "am", "Sam", "</s>"));
+    set.add(new StringList("<s>", "Sam", "I", "am", "</s>"));
+    set.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", 
"and", "ham", "</s>"));
+    set.add(new StringList(""));
+    Double d = NGramUtils.calculateTrigramMLProbability("I", "am", "Sam", set);
+    assertEquals(Double.valueOf(0.5), d);
+    d = NGramUtils.calculateTrigramMLProbability("Sam", "I", "am", set);
+    assertEquals(Double.valueOf(1d), d);
+  }
+
+  @Test
+  public void testNgramMLProbability() {
+    Collection<StringList> set = new LinkedList<StringList>();
+    set.add(new StringList("<s>", "I", "am", "Sam", "</s>"));
+    set.add(new StringList("<s>", "Sam", "I", "am", "</s>"));
+    set.add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", 
"and", "ham", "</s>"));
+    set.add(new StringList(""));
+    Double d = NGramUtils.calculateNgramMLProbability(new StringList("I", 
"am", "Sam"), set);
+    assertEquals(Double.valueOf(0.5), d);
+    d = NGramUtils.calculateNgramMLProbability(new StringList("Sam", "I", 
"am"), set);
+    assertEquals(Double.valueOf(1d), d);
+  }
+
+  @Test
+  public void testLinearInterpolation() throws Exception {
+    Collection<StringList> set = new LinkedList<StringList>();
+    set.add(new StringList("the", "green", "book", "STOP"));
+    set.add(new StringList("my", "blue", "book", "STOP"));
+    set.add(new StringList("his", "green", "house", "STOP"));
+    set.add(new StringList("book", "STOP"));
+    Double lambda = 1d / 3d;
+    Double d = 
NGramUtils.calculateTrigramLinearInterpolationProbability("the", "green", 
"book", set, lambda, lambda, lambda);
+    assertNotNull(d);
+    assertEquals("wrong result", Double.valueOf(0.5714285714285714d), d);
+  }
+
+  @Test
+  public void testLinearInterpolation2() throws Exception {
+    Collection<StringList> set = new LinkedList<StringList>();
+    set.add(new StringList("D", "N", "V", "STOP"));
+    set.add(new StringList("D", "N", "V", "STOP"));
+    Double lambda = 1d / 3d;
+    Double d = NGramUtils.calculateTrigramLinearInterpolationProbability("N", 
"V", "STOP", set, lambda, lambda, lambda);
+    assertNotNull(d);
+    assertEquals("wrong result", Double.valueOf(0.75d), d);
+  }
+
+  @Test
+  public void testGetNGrams() throws Exception {
+    Collection<StringList> nGrams = NGramUtils.getNGrams(new StringList("I", 
"saw", "brown", "fox"), 2);
+    assertEquals(3, nGrams.size());
+    nGrams = NGramUtils.getNGrams(new StringList("I", "saw", "brown", "fox"), 
3);
+    assertEquals(2, nGrams.size());
+  }
+
+}

Propchange: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramUtilsTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt?rev=1729193&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
 Mon Feb  8 15:52:08 2016
@@ -0,0 +1,52 @@
+The word2vec software of Tomas Mikolov and colleagues has gained a lot of 
traction lately and provides state-of-the-art word embeddings
+The learning models behind the software are described in two research papers
+We found the description of the models in these papers to be somewhat cryptic 
and hard to follow
+While the motivations and presentation may be obvious to the neural-networks 
language-mofdeling crowd we had to struggle quite a bit to figure out the 
rationale behind the equations
+This note is an attempt to explain the negative sampling equation in 
Distributed Representations of Words and Phrases and their Compositionality by 
Tomas Mikolov Ilya Sutskever Kai Chen Greg Corrado and Jeffrey Dean
+The departure point of the paper is the skip-gram model
+In this model we are given a corpus of words w and their contexts c
+We consider the conditional probabilities p(c|w) and given a corpus Text the 
goal is to set the parameters Î¸ of p(c|w;Î¸) so as to maximize the corpus 
probability
+The recently introduced continuous Skip-gram model is an efficient method for 
learning high-quality distributed vector representations that capture a large 
number of precise syntactic and semantic word relationships
+In this paper we present several extensions that improve both the quality of 
the vectors and the training speed
+By subsampling of the frequent words we obtain significant speedup and also 
learn more regular word representations
+We also describe a simple alternative to the hierarchical softmax called 
negative sampling
+An inherent limitation of word representations is their indifference to word 
order and their inability to represent idiomatic phrases
+For example the meanings of Canada and Air cannot be easily combined to obtain 
Air Canada
+Motivated by this example we present a simple method for finding phrases in 
text and show that learning good vector representations for millions of phrases 
is possible
+The similarity metrics used for nearest neighbor evaluations produce a single 
scalar that quantifies the relatedness of two words
+This simplicity can be problematic since two given words almost always exhibit 
more intricate relationships than can be captured by a single number
+For example man may be regarded as similar to woman in that both words 
describe human beings on the other hand the two words are often considered 
opposites since they highlight a primary axis along which humans differ from 
one another
+In order to capture in a quantitative way the nuance necessary to distinguish 
man from woman it is necessary for a model to associate more than a single 
number to the word pair
+A natural and simple candidate for an enlarged set of discriminative numbers 
is the vector difference between the two word vectors
+GloVe is designed in order that such vector differences capture as much as 
possible the meaning specified by the juxtaposition of two words
+Unsupervised word representations are very useful in NLP tasks both as inputs 
to learning algorithms and as extra word features in NLP systems
+However most of these models are built with only local context and one 
representation per word
+This is problematic because words are often polysemous and global context can 
also provide useful information for learning word meanings
+We present a new neural network architecture which 1) learns word embeddings 
that better capture the semantics of words by incorporating both local and 
global document context and 2) accounts for homonymy and polysemy by learning 
multiple embeddings per word
+We introduce a new dataset with human judgments on pairs of words in 
sentential context and evaluate our model on it showing that our model 
outperforms competitive baselines and other neural language models
+Information Retrieval (IR) models need to deal with two difficult issues 
vocabulary mismatch and term dependencies
+Vocabulary mismatch corresponds to the difficulty of retrieving relevant 
documents that do not contain exact query terms but semantically related terms
+Term dependencies refers to the need of considering the relationship between 
the words of the query when estimating the relevance of a document
+A multitude of solutions has been proposed to solve each of these two problems 
but no principled model solve both
+In parallel in the last few years language models based on neural networks 
have been used to cope with complex natural language processing tasks like 
emotion and paraphrase detection
+Although they present good abilities to cope with both term dependencies and 
vocabulary mismatch problems thanks to the distributed representation of words 
they are based upon such models could not be used readily in IR where the 
estimation of one language model per document (or query) is required
+This is both computationally unfeasible and prone to over-fitting
+Based on a recent work that proposed to learn a generic language model that 
can be modified through a set of document-specific parameters we explore use of 
new neural network models that are adapted to ad-hoc IR tasks
+Within the language model IR framework we propose and study the use of a 
generic language model as well as a document-specific language model
+Both can be used as a smoothing component but the latter is more adapted to 
the document at hand and has the potential of being used as a full document 
language model
+We experiment with such models and analyze their results on TREC-1 to 8 
datasets
+The word2vec model and application by Mikolov et al have attracted a great 
amount of attention in recent two years
+The vector representations of words learned by word2vec models have been 
proven to be able to carry semantic meanings and are useful in various NLP tasks
+As an increasing number of researchers would like to experiment with word2vec 
I notice that there lacks a material that comprehensively explains the 
parameter learning process of word2vec in details thus preventing many people 
with less neural network experience from understanding how exactly word2vec 
works
+This note provides detailed derivations and explanations of the parameter 
update equations for the word2vec models including the original continuous 
bag-of-word (CBOW) and skip-gram models as well as advanced tricks hierarchical 
soft-max and negative sampling
+In the appendix a review is given on the basics of neuron network models and 
backpropagation
+To avoid the inaccuracy caused by classifying the example into several 
categories given by TREC manually we take the word2vec to represent all 
attractions and user contexts in the continuous vector space learnt by neural 
network language models
+The base of NNML is using neural networks for the probability function
+The model learns simultaneously a distributed representation for each word 
along with the probability function for word sequences expressed in terms of 
these representations
+Training such large models we propose continuous bag of words as our framework 
and soft-max as the active function
+So we use the word2vec to train wikitravel corpus and got the word vector
+To avoid the curse of dimensionality by learning a distributed representation 
for words as our word vector we define a test set that compare different 
dimensionality of vectors for our task using the same training data and using 
the same model architecture
+We extend the word2vec framework to capture meaning across languages
+The input consists of a source text and a word-aligned parallel text in a 
second language
+The joint word2vec tool then represents words in both languages within a 
common âsemanticâ vector space
+The result can be used to enrich lexicons of under-resourced languages to 
identify ambiguities and to perform clustering and classification
\ No newline at end of file

Propchange: 
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/languagemodel/sentences.txt
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r1729193 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/languagemodel/ main/java/opennlp/tools/ngram/ test/java/opennlp/tools/languagemodel/ test/java/opennlp/tools/ngram/ test/resources/opennlp/tools/languagemodel/

Reply via email to