Author: ragerri
Date: Thu Feb 18 21:02:34 2016
New Revision: 1731145
URL: http://svn.apache.org/viewvc?rev=1731145&view=rev
Log:
OPENNLP-760 adding factory and string utils to induce lemma classes
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java?rev=1731145&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java
Thu Feb 18 21:02:34 2016
@@ -0,0 +1,49 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+
+/**
+ * Reads lemma samples for training and testing. Each input line is expected
+ * to consist of three tab-separated fields: word\tabpostag\tablemma.
+ * <p>
+ * {@link #read()} consumes lines from the underlying stream until it meets an
+ * empty line or the end of the stream. Lines that do not split into exactly
+ * three fields are reported on System.err and skipped. For every accepted line
+ * the word and the postag are stored as read, while the third element stored
+ * is not the lemma itself but the result of
+ * {@link StringUtil#getShortestEditScript(String, String)} applied to the word
+ * and the lemma.
+ * <p>
+ * {@link #read()} returns null when no line was accepted.
+ * NOTE(review): this includes the case in which the first line read is empty,
+ * even though more lines may follow -- confirm that consecutive empty lines
+ * cannot occur in the input, since null is presumably taken as end of stream.
+ * @version 2016-02-16
+ */
+public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample>
{
+
+ public LemmaSampleStream(ObjectStream<String> samples) {
+ super(samples);
+ }
+
+ public LemmaSample read() throws IOException {
+
+ List<String> toks = new ArrayList<String>();
+ List<String> tags = new ArrayList<String>();
+ List<String> preds = new ArrayList<String>();
+
+ for (String line = samples.read(); line != null && !line.equals(""); line
= samples.read()) {
+ String[] parts = line.split("\t");
+ if (parts.length != 3) {
+ System.err.println("Skipping corrupt line: " + line);
+ }
+ else {
+ toks.add(parts[0]);
+ tags.add(parts[1]);
+ String ses = StringUtil.getShortestEditScript(parts[0], parts[2]);
+ preds.add(ses);
+ }
+ }
+ if (toks.size() > 0) {
+ LemmaSample lemmaSample = new LemmaSample(toks.toArray(new
String[toks.size()]), tags.toArray(new String[tags.size()]), preds.toArray(new
String[preds.size()]));
+ return lemmaSample;
+ }
+ else {
+ return null;
+ }
+ }
+}
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java?rev=1731145&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java
Thu Feb 18 21:02:34 2016
@@ -0,0 +1,18 @@
+package opennlp.tools.lemmatizer;
+
+/**
+ * The interface for lemmatizers.
+ */
+public interface Lemmatizer {
+
+ /**
+ * Generates a lemma class for each token, given the tokens and their pos
+ * tags, and returns the result in an array.
+ *
+ * @param toks an array of the tokens
+ * @param tags an array of the pos tags
+ *
+ * @return an array of lemma classes for each token in the sequence.
+ */
+ public String[] lemmatize(String[] toks, String tags[]);
+
+}
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java?rev=1731145&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluationMonitor.java
Thu Feb 18 21:02:34 2016
@@ -0,0 +1,12 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * Evaluation monitor interface for the lemmatizer: an
+ * {@link EvaluationMonitor} specialised for {@link LemmaSample}s that declares
+ * no members of its own.
+ * @version 2016-02-18
+ *
+ */
+public interface LemmatizerEvaluationMonitor extends
EvaluationMonitor<LemmaSample> {
+
+}
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java?rev=1731145&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerEvaluator.java
Thu Feb 18 21:02:34 2016
@@ -0,0 +1,88 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LemmatizerEvaluator} measures the performance of
+ * the given {@link Lemmatizer} with the provided reference
+ * {@link LemmaSample}s.
+ */
+public class LemmatizerEvaluator extends Evaluator<LemmaSample> {
+
+ private Lemmatizer lemmatizer;
+
+ private Mean wordAccuracy = new Mean();
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param aLemmatizer the lemmatizer whose output is evaluated
+ * @param listeners evaluation listeners, passed on to the super class
+ */
+ public LemmatizerEvaluator(Lemmatizer aLemmatizer,
LemmatizerEvaluationMonitor ... listeners) {
+ super(listeners);
+ this.lemmatizer = aLemmatizer;
+ }
+
+ /**
+ * Evaluates the given reference {@link LemmaSample} object.
+ *
+ * This is done by lemmatizing the tokens and tags of the reference
+ * {@link LemmaSample} with the {@link Lemmatizer}. The predicted lemmas
+ * are compared position by position with the reference lemmas; every
+ * match adds 1 and every mismatch adds 0 to the word accuracy score.
+ *
+ * NOTE(review): the predicted array is indexed up to the length of the
+ * reference lemmas, so this assumes the lemmatizer returns at least one
+ * entry per reference lemma -- confirm against the implementations.
+ *
+ * @param reference the reference {@link LemmaSample}.
+ *
+ * @return a new {@link LemmaSample} holding the reference tokens and tags
+ * together with the predicted lemmas.
+ */
+ @Override
+ protected LemmaSample processSample(LemmaSample reference) {
+
+ String[] predictedLemmas = lemmatizer.lemmatize(reference.getTokens(),
reference.getTags());
+ String[] referenceLemmas = reference.getLemmas();
+
+ for (int i = 0; i < referenceLemmas.length; i++) {
+ //System.err.println("-> Reference: " + referenceLemmas[i]);
+ //System.err.println("-> Predicted: " + predictedLemmas[i]);
+ if (referenceLemmas[i].equals(predictedLemmas[i])) {
+ wordAccuracy.add(1);
+ }
+ else {
+ wordAccuracy.add(0);
+ }
+ }
+ return new LemmaSample(reference.getTokens(), reference.getTags(),
predictedLemmas);
+ }
+
+ /**
+ * Retrieves the word accuracy.
+ *
+ * This is defined as:
+ * word accuracy = correctly predicted lemmas / total words
+ *
+ * @return the word accuracy
+ */
+ public double getWordAccuracy() {
+ return wordAccuracy.mean();
+ }
+
+ /**
+ * Retrieves the total number of words considered
+ * in the evaluation.
+ *
+ * @return the word count
+ */
+ public long getWordCount() {
+ return wordAccuracy.count();
+ }
+
+ /**
+ * Represents this object as a human-readable {@link String}.
+ *
+ * NOTE(review): the value printed after "Number of Samples" is the same
+ * count returned by {@link #getWordCount()}, i.e. one entry per compared
+ * word rather than per {@link LemmaSample} -- confirm the label is intended.
+ */
+ @Override
+ public String toString() {
+ return "Accuracy:" + wordAccuracy.mean() +
+ " Number of Samples: " + wordAccuracy.count();
+ }
+}
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java?rev=1731145&view=auto
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
(added)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerFactory.java
Thu Feb 18 21:02:34 2016
@@ -0,0 +1,48 @@
+package opennlp.tools.lemmatizer;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+public class LemmatizerFactory extends BaseToolFactory {
+
+ /**
+ * Creates a {@link LemmatizerFactory} that provides the default
+ * implementation of the resources. This is also the factory returned by
+ * {@link #create(String)} when it is called with a null subclass name.
+ */
+ public LemmatizerFactory() {
+ }
+
+ public static LemmatizerFactory create(String subclassName)
+ throws InvalidFormatException {
+ if (subclassName == null) {
+ // no subclass requested: fall back to the default factory
+ return new LemmatizerFactory();
+ }
+ try {
+ LemmatizerFactory theFactory = ExtensionLoader.instantiateExtension(
+ LemmatizerFactory.class, subclassName);
+ return theFactory;
+ } catch (Exception e) {
+ String msg = "Could not instantiate the " + subclassName
+ + ". The initialization throw an exception.";
+ System.err.println(msg);
+ e.printStackTrace();
+ throw new InvalidFormatException(msg, e);
+ }
+ }
+
+ @Override
+ public void validateArtifactMap() throws InvalidFormatException {
+ // nothing to validate: this factory has no additional artifacts
+ }
+
+ public SequenceValidator<String> getSequenceValidator() {
+ return new DefaultLemmatizerSequenceValidator();
+ }
+
+ public LemmatizerContextGenerator getContextGenerator() {
+ return new DefaultLemmatizerContextGenerator();
+ }
+}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java?rev=1731145&r1=1731144&r2=1731145&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
Thu Feb 18 21:02:34 2016
@@ -113,4 +113,143 @@ public class StringUtil {
public static boolean isEmpty(CharSequence theString) {
return theString.length() == 0;
}
+
+ /**
+ * Gets the minimum of three values.
+ * @param a number a
+ * @param b number b
+ * @param c number c
+ * @return the smallest of a, b and c
+ */
+ private static int minimum(int a, int b, int c) {
+ int minValue;
+ minValue = a;
+ if (b < minValue) {
+ minValue = b;
+ }
+ if (c < minValue) {
+ minValue = c;
+ }
+ return minValue;
+ }
+
+ /**
+ * Computes the Levenshtein distance matrix of two strings.
+ * Based on pseudo-code provided here:
+ * https://en.wikipedia.org/wiki/Levenshtein_distance#Computing_Levenshtein_distance
+ * which in turn is based on the paper Wagner, Robert A.; Fischer, Michael
+ * J. (1974), "The String-to-String Correction Problem", Journal of the ACM
+ * 21 (1): 168-173.
+ * <p>
+ * The returned matrix has wordForm.length() + 1 rows and lemma.length() + 1
+ * columns. Cell [i][j] holds the distance between the first i characters of
+ * the word form and the first j characters of the lemma, so the distance
+ * between the two complete strings is found in the last cell.
+ * <p>
+ * NOTE(review): when either string is empty the freshly allocated matrix is
+ * returned before row 0 and column 0 are filled in, so all of its cells are 0
+ * instead of the length of the other string -- confirm this is intended.
+ * @param wordForm the form
+ * @param lemma the lemma
+ * @return the distance matrix
+ */
+ public static int[][] levenshteinDistance(String wordForm, String lemma) {
+
+ int wordLength = wordForm.length();
+ int lemmaLength = lemma.length();
+ int cost;
+ int[][] distance = new int[wordLength + 1][lemmaLength + 1];
+
+ if (wordLength == 0) {
+ return distance;
+ }
+ if (lemmaLength == 0) {
+ return distance;
+ }
+ //fill in the rows of column 0
+ for (int i = 0; i <= wordLength; i++) {
+ distance[i][0] = i;
+ }
+ //fill in the columns of row 0
+ for (int j = 0; j <= lemmaLength; j++) {
+ distance[0][j] = j;
+ }
+ //fill in the rest of the matrix calculating the minimum distance
+ for (int i = 1; i <= wordLength; i++) {
+ int s_i = wordForm.charAt(i - 1);
+ for (int j = 1; j <= lemmaLength; j++) {
+ if (s_i == lemma.charAt(j - 1)) {
+ cost = 0;
+ } else {
+ cost = 1;
+ }
+ //obtain minimum distance from calculating deletion, insertion,
substitution
+ distance[i][j] = minimum(distance[i - 1][j] + 1, distance[i][j - 1] +
1, distance[i - 1][j - 1] + cost);
+ }
+ }
+ return distance;
+ }
+
+ /**
+ * Computes the Shortest Edit Script (SES) to convert a word into its lemma.
+ * This is based on Chrupala's PhD thesis (2008).
+ * <p>
+ * The matrix is walked backwards from its last cell until a cell holding 0
+ * is reached. At each step a neighbouring cell with a strictly smaller value
+ * is preferred, checked in the order diagonal, left, up, and the matching
+ * operation is appended to the buffer without any separator:
+ * 'R' + index + word character + lemma character for a replacement,
+ * 'I' + index + lemma character for an insertion, and
+ * 'D' + index + word character for a deletion.
+ * The index is the zero-based position in wordForm of the character that is
+ * replaced or deleted; for an insertion it is the number of wordForm
+ * characters still to be processed. If no neighbour is smaller, the walk
+ * moves to a neighbour holding the same value (diagonal, then up, then left)
+ * and appends nothing.
+ * <p>
+ * NOTE(review): the loop only ends on a cell holding 0; this presumably
+ * relies on the matrix coming from levenshteinDistance for the same two
+ * strings -- with any other matrix none of the branches may apply.
+ * @param wordForm the token
+ * @param lemma the target lemma
+ * @param distance the levenshtein distance matrix of wordForm and lemma
+ * @param permutations the buffer to which the edit operations are appended
+ */
+public static void computeShortestEditScript(String wordForm, String lemma,
int[][] distance, StringBuffer permutations) {
+
+ int n = distance.length;
+ int m = distance[0].length;
+
+ int wordFormLength = n - 1;
+ int lemmaLength = m - 1;
+ while(true) {
+
+ if (distance[wordFormLength][lemmaLength] == 0) {
+ break;
+ }
+ if ((lemmaLength > 0 && wordFormLength > 0) &&
(distance[wordFormLength - 1][lemmaLength - 1] <
distance[wordFormLength][lemmaLength])) {
+ permutations.append('R').append(Integer.toString(wordFormLength -
1)).append(wordForm.charAt(wordFormLength - 1)).append(lemma.charAt(lemmaLength
- 1));
+ lemmaLength--;
+ wordFormLength--;
+ continue;
+ }
+ if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] <
distance[wordFormLength][lemmaLength])) {
+
permutations.append('I').append(Integer.toString(wordFormLength)).append(lemma.charAt(lemmaLength
- 1));
+ lemmaLength--;
+ continue;
+ }
+ if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength] <
distance[wordFormLength][lemmaLength])) {
+ permutations.append('D').append(Integer.toString(wordFormLength -
1)).append(wordForm.charAt(wordFormLength - 1));
+ wordFormLength--;
+ continue;
+ }
+ if ((wordFormLength > 0 && lemmaLength > 0) &&
(distance[wordFormLength - 1][lemmaLength - 1] ==
distance[wordFormLength][lemmaLength])) {
+ wordFormLength--; lemmaLength--;
+ continue ;
+ }
+ if (wordFormLength > 0 && (distance[wordFormLength - 1][lemmaLength]
== distance[wordFormLength][lemmaLength])) {
+ wordFormLength--;
+ continue;
+ }
+ if (lemmaLength > 0 && (distance[wordFormLength][lemmaLength - 1] ==
distance[wordFormLength][lemmaLength])) {
+ lemmaLength--;
+ continue;
+ }
+ }
+}
+
+/**
+ * Gets the SES required to go from a word to a lemma.
+ * <p>
+ * Both strings are lower-cased (with the no-argument toLowerCase(), i.e.
+ * using the default locale) and reversed before they are compared. If the
+ * results are equal the script is the single letter "O"; otherwise the
+ * Levenshtein matrix and the edit script are computed on the reversed
+ * strings, so the positions in the script refer to the reversed word.
+ * @param wordForm the word
+ * @param lemma the lemma
+ * @return the shortest edit script, or "O" when word and lemma are equal
+ * after lower-casing
+ */
+public static String getShortestEditScript(String wordForm, String lemma) {
+ String reversedWF = new
StringBuffer(wordForm.toLowerCase()).reverse().toString();
+ String reversedLemma = new
StringBuffer(lemma.toLowerCase()).reverse().toString();
+ StringBuffer permutations = new StringBuffer();
+ String ses;
+ if (!reversedWF.equals(reversedLemma)) {
+ int[][]levenDistance = StringUtil.levenshteinDistance(reversedWF,
reversedLemma);
+ StringUtil.computeShortestEditScript(reversedWF, reversedLemma,
levenDistance, permutations);
+ ses = permutations.toString();
+ } else {
+ ses = "O";
+ }
+ return ses;
+}
+
}