Author: ragerri
Date: Thu Feb 18 21:02:34 2016
New Revision: 1731145

URL: http://svn.apache.org/viewvc?rev=1731145&view=rev
Log:
OPENNLP-760 adding factory and string utils to induce lemma classes

Added:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java?rev=1731145&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
 Thu Feb 18 21:02:34 2016
@@ -0,0 +1,49 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+
+/**
+ * Reads lemmatizer training and evaluation data and converts it into
+ * {@link LemmaSample} objects.
+ * <p>
+ * Every input line describes one token as three tab-separated columns:
+ * {@code word<TAB>postag<TAB>lemma}. An empty line ends a sentence. The lemma
+ * column is not stored as is: it is replaced by the edit script returned by
+ * {@link StringUtil#getShortestEditScript(String, String)} for the word and
+ * its lemma.
+ *
+ * @version 2016-02-16
+ */
+public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> {
+
+  public LemmaSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads the next sentence from the underlying line stream.
+   * <p>
+   * Lines that do not split into exactly three tab-separated columns are
+   * reported on {@code System.err} and skipped. Groups of lines that yield no
+   * token at all (repeated blank lines, or a sentence made only of corrupt
+   * lines) are skipped as well, so that {@code null} is returned only when the
+   * underlying stream is exhausted.
+   *
+   * @return the next {@link LemmaSample}, or {@code null} at the end of the stream
+   * @throws IOException if reading from the underlying stream fails
+   */
+  @Override
+  public LemmaSample read() throws IOException {
+
+    List<String> toks = new ArrayList<String>();
+    List<String> tags = new ArrayList<String>();
+    List<String> preds = new ArrayList<String>();
+
+    for (String line = samples.read(); line != null; line = samples.read()) {
+      if (line.isEmpty()) {
+        // A blank line closes the sentence, but only if something was collected;
+        // otherwise it is a stray separator and must not end the stream.
+        if (!toks.isEmpty()) {
+          break;
+        }
+        continue;
+      }
+      String[] parts = line.split("\t");
+      if (parts.length != 3) {
+        System.err.println("Skipping corrupt line: " + line);
+      }
+      else {
+        toks.add(parts[0]);
+        tags.add(parts[1]);
+        // the class to predict is the edit script, not the lemma itself
+        preds.add(StringUtil.getShortestEditScript(parts[0], parts[2]));
+      }
+    }
+    if (toks.isEmpty()) {
+      // nothing collected: the underlying stream has no more data
+      return null;
+    }
+    return new LemmaSample(toks.toArray(new String[toks.size()]),
+        tags.toArray(new String[tags.size()]),
+        preds.toArray(new String[preds.size()]));
+  }
+}

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java?rev=1731145&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
 Thu Feb 18 21:02:34 2016
@@ -0,0 +1,18 @@
+package opennlp.tools.lemmatizer;
+
+/**
+ * Contract implemented by every lemmatizer.
+ */
+public interface Lemmatizer {
+
+  /**
+   * Produces the lemma tags for a sequence, given its words and their
+   * pos tags, and returns them as an array.
+   *
+   * @param toks an array holding the tokens
+   * @param tags an array holding the pos tags
+   *
+   * @return an array with the lemma class of each token in the sequence.
+   */
+  String[] lemmatize(String[] toks, String[] tags);
+
+}

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java?rev=1731145&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
 Thu Feb 18 21:02:34 2016
@@ -0,0 +1,12 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * Evaluation monitor for the lemmatizer: an {@link EvaluationMonitor}
+ * bound to {@link LemmaSample}. It declares no members of its own and only
+ * fixes the sample type of the monitor.
+ * @version 2016-02-18
+ *
+ */
+public interface LemmatizerEvaluationMonitor extends 
EvaluationMonitor<LemmaSample> {
+
+}

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java?rev=1731145&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
 Thu Feb 18 21:02:34 2016
@@ -0,0 +1,88 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Scores a {@link Lemmatizer} against reference {@link LemmaSample}s,
+ * keeping a running mean of per-word hits and misses.
+ */
+public class LemmatizerEvaluator extends Evaluator<LemmaSample> {
+
+  private final Lemmatizer lemmatizer;
+
+  private final Mean wordAccuracy = new Mean();
+
+  /**
+   * Creates an evaluator for the given lemmatizer.
+   *
+   * @param aLemmatizer the lemmatizer to evaluate
+   * @param listeners the evaluation listeners handed to the base evaluator
+   */
+  public LemmatizerEvaluator(Lemmatizer aLemmatizer,
+      LemmatizerEvaluationMonitor ... listeners) {
+    super(listeners);
+    this.lemmatizer = aLemmatizer;
+  }
+
+  /**
+   * Evaluates one reference {@link LemmaSample}.
+   *
+   * The tokens and tags of the reference are passed to the
+   * {@link Lemmatizer}; each predicted lemma is compared with the
+   * reference lemma at the same position and the outcome (1 for a match,
+   * 0 otherwise) is added to the word accuracy.
+   *
+   * @param reference the reference {@link LemmaSample}.
+   *
+   * @return a {@link LemmaSample} holding the reference tokens and tags
+   *         together with the predicted lemmas.
+   */
+  @Override
+  protected LemmaSample processSample(LemmaSample reference) {
+
+    String[] predictedLemmas =
+        lemmatizer.lemmatize(reference.getTokens(), reference.getTags());
+    String[] referenceLemmas = reference.getLemmas();
+
+    int position = 0;
+    for (String expected : referenceLemmas) {
+      wordAccuracy.add(expected.equals(predictedLemmas[position]) ? 1 : 0);
+      position++;
+    }
+    return new LemmaSample(reference.getTokens(), reference.getTags(),
+        predictedLemmas);
+  }
+
+  /**
+   * Retrieves the word accuracy.
+   *
+   * This is defined as:
+   * word accuracy = correctly predicted lemmas / total words
+   *
+   * @return the word accuracy
+   */
+  public double getWordAccuracy() {
+    return wordAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the total number of words considered
+   * in the evaluation.
+   *
+   * @return the word count
+   */
+  public long getWordCount() {
+    return wordAccuracy.count();
+  }
+
+  /**
+   * Represents this object as a human readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("Accuracy:");
+    text.append(wordAccuracy.mean());
+    text.append(" Number of Samples: ");
+    text.append(wordAccuracy.count());
+    return text.toString();
+  }
+}

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java?rev=1731145&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
 Thu Feb 18 21:02:34 2016
@@ -0,0 +1,48 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+/**
+ * Tool factory of the lemmatizer. It supplies the sequence validator and
+ * the context generator, and can be replaced by a subclass loaded by name.
+ */
+public class LemmatizerFactory extends BaseToolFactory {
+
+  /**
+   * Creates a {@link LemmatizerFactory} that provides the default
+   * implementation of the resources.
+   */
+  public LemmatizerFactory() {
+  }
+
+  /**
+   * Creates a factory instance.
+   *
+   * @param subclassName the name of the factory class to instantiate through
+   *        the {@link ExtensionLoader}, or {@code null} for the default factory
+   * @return the factory
+   * @throws InvalidFormatException if the named factory cannot be instantiated;
+   *         the failure is also reported on {@code System.err}
+   */
+  public static LemmatizerFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName != null) {
+      try {
+        return ExtensionLoader.instantiateExtension(
+            LemmatizerFactory.class, subclassName);
+      } catch (Exception e) {
+        String msg = "Could not instantiate the " + subclassName
+            + ". The initialization throw an exception.";
+        System.err.println(msg);
+        e.printStackTrace();
+        throw new InvalidFormatException(msg, e);
+      }
+    }
+    // no subclass requested: fall back to the default factory
+    return new LemmatizerFactory();
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // no additional artifacts
+  }
+
+  /**
+   * @return a new {@link DefaultLemmatizerSequenceValidator}
+   */
+  public SequenceValidator<String> getSequenceValidator() {
+    return new DefaultLemmatizerSequenceValidator();
+  }
+
+  /**
+   * @return a new {@link DefaultLemmatizerContextGenerator}
+   */
+  public LemmatizerContextGenerator getContextGenerator() {
+    return new DefaultLemmatizerContextGenerator();
+  }
+}

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java?rev=1731145&r1=1731144&r2=1731145&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java 
(original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java 
Thu Feb 18 21:02:34 2016
@@ -113,4 +113,143 @@ public class StringUtil {
   public static boolean isEmpty(CharSequence theString) {
        return theString.length() == 0;
   }
+  
+  /**
+   * Returns the smallest of three integers.
+   * @param a first candidate
+   * @param b second candidate
+   * @param c third candidate
+   * @return the smallest of {@code a}, {@code b} and {@code c}
+   */
+  private static int minimum(int a, int b, int c) {
+      return Math.min(a, Math.min(b, c));
+  }
+  
+  /**
+   * Computes the Levenshtein distance matrix of two strings.
+   * Based on pseudo-code provided here:
+   * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
+   * which in turn is based on the paper Wagner, Robert A.; Fischer, Michael J. (1974),
+   * "The String-to-String Correction Problem", Journal of the ACM 21 (1): 168-173
+   * <p>
+   * Cell {@code [i][j]} holds the distance between the first {@code i}
+   * characters of {@code wordForm} and the first {@code j} characters of
+   * {@code lemma}; the distance of the full strings is in the last cell.
+   * Empty strings are handled like any other input: row 0 and column 0 are
+   * always initialised, so the distance to an empty string is the length of
+   * the other string.
+   *
+   * @param wordForm the form
+   * @param lemma the lemma
+   * @return the distance matrix, of size
+   *         {@code (wordForm.length() + 1) x (lemma.length() + 1)}
+   */
+  public static int[][] levenshteinDistance(String wordForm, String lemma) {
+
+    int wordLength = wordForm.length();
+    int lemmaLength = lemma.length();
+    int[][] distance = new int[wordLength + 1][lemmaLength + 1];
+
+    //fill in the rows of column 0: i deletions turn a prefix into ""
+    for (int i = 0; i <= wordLength; i++) {
+      distance[i][0] = i;
+    }
+    //fill in the columns of row 0: j insertions turn "" into a prefix
+    for (int j = 0; j <= lemmaLength; j++) {
+      distance[0][j] = j;
+    }
+    //fill in the rest of the matrix calculating the minimum distance
+    for (int i = 1; i <= wordLength; i++) {
+      char wordChar = wordForm.charAt(i - 1);
+      for (int j = 1; j <= lemmaLength; j++) {
+        int cost = (wordChar == lemma.charAt(j - 1)) ? 0 : 1;
+        //obtain minimum distance from calculating deletion, insertion, substitution
+        distance[i][j] = minimum(distance[i - 1][j] + 1,
+            distance[i][j - 1] + 1,
+            distance[i - 1][j - 1] + cost);
+      }
+    }
+    return distance;
+  }
+  
+  /**
+   * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
+   * This is based on Chrupala's PhD thesis (2008).
+   * <p>
+   * The distance matrix is walked backwards, starting at its last cell
+   * ({@code distance.length - 1}, {@code distance[0].length - 1}) and stopping
+   * at the first cell that contains 0. The branches are tried in a fixed
+   * order and each step moves to a neighbouring cell:
+   * <ul>
+   * <li>diagonal neighbour smaller: appends {@code 'R'}, the index
+   * {@code wordFormLength - 1}, the character of {@code wordForm} at that
+   * index and the character of {@code lemma} at {@code lemmaLength - 1};</li>
+   * <li>left neighbour smaller: appends {@code 'I'}, the index
+   * {@code wordFormLength} and the character of {@code lemma} at
+   * {@code lemmaLength - 1};</li>
+   * <li>upper neighbour smaller: appends {@code 'D'}, the index
+   * {@code wordFormLength - 1} and the character of {@code wordForm} at that
+   * index;</li>
+   * <li>a neighbour holding the same value: moves there without appending
+   * anything.</li>
+   * </ul>
+   * NOTE(review): the loop only exits on a cell containing 0; this presumably
+   * relies on {@code distance} being the matrix produced by
+   * {@link #levenshteinDistance(String, String)} for the same two strings -
+   * confirm before passing any other matrix.
+   *
+   * @param wordForm the token
+   * @param lemma the target lemma
+   * @param distance the distance matrix of {@code wordForm} and {@code lemma}
+   * @param permutations the buffer the edit operations are appended to
+   */
+public static void computeShortestEditScript(String wordForm, String lemma, 
int[][] distance, StringBuffer permutations) {
+    
+    int n = distance.length;
+    int m = distance[0].length;
+    
+    int wordFormLength = n - 1;
+    int lemmaLength = m - 1;
+    while(true) {
+        
+        if (distance[wordFormLength][lemmaLength] == 0) {
+          break;
+        }
+        if ((lemmaLength > 0 && wordFormLength > 0) && 
(distance[wordFormLength - 1][lemmaLength - 1] < 
distance[wordFormLength][lemmaLength])) {
+            permutations.append('R').append(Integer.toString(wordFormLength - 
1)).append(wordForm.charAt(wordFormLength - 1)).append(lemma.charAt(lemmaLength 
- 1));
+            lemmaLength--;
+            wordFormLength--;
+            continue;
+        }
+        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] < 
distance[wordFormLength][lemmaLength])) {
+            
permutations.append('I').append(Integer.toString(wordFormLength)).append(lemma.charAt(lemmaLength
 - 1));
+            lemmaLength--;
+            continue;
+        }
+        if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] < 
distance[wordFormLength][lemmaLength])) {
+            permutations.append('D').append(Integer.toString(wordFormLength - 
1)).append(wordForm.charAt(wordFormLength - 1));
+            wordFormLength--;
+            continue;
+        }
+        if ((wordFormLength > 0 && lemmaLength > 0) && 
(distance[wordFormLength - 1][lemmaLength - 1] == 
distance[wordFormLength][lemmaLength])) {
+            wordFormLength--; lemmaLength--;
+            continue ;
+        }
+        if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] 
== distance[wordFormLength][lemmaLength])) {
+            wordFormLength--;
+            continue;
+        }
+        if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] == 
distance[wordFormLength][lemmaLength])) {
+            lemmaLength--;
+            continue;
+        }   
+    }
+}
+
+/**
+ * Builds the edit script that leads from a word to its lemma.
+ * Both strings are lower-cased and reversed before they are compared; if
+ * they are then equal the script is the single symbol {@code "O"}, otherwise
+ * it is whatever {@code computeShortestEditScript} produces for the reversed
+ * strings and their distance matrix.
+ * @param wordForm the word
+ * @param lemma the lemma
+ * @return the shortest edit script
+ */
+public static String getShortestEditScript(String wordForm, String lemma) {
+  String backwardsForm = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
+  String backwardsLemma = new StringBuffer(lemma.toLowerCase()).reverse().toString();
+  if (backwardsForm.equals(backwardsLemma)) {
+    // word and lemma coincide: nothing to edit
+    return "O";
+  }
+  StringBuffer script = new StringBuffer();
+  int[][] matrix = StringUtil.levenshteinDistance(backwardsForm, backwardsLemma);
+  StringUtil.computeShortestEditScript(backwardsForm, backwardsLemma, matrix, script);
+  return script.toString();
+}
+
 }


Reply via email to