StringUtil.java

ragerri Thu, 18 Feb 2016 13:10:13 -0800

Author: ragerri
Date: Thu Feb 18 21:07:48 2016
New Revision: 1731148

URL: http://svn.apache.org/viewvc?rev=1731148&view=rev
Log:
OPENNLP-760 adding learnable lemmatizer and model


Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java?rev=1731148&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
 Thu Feb 18 21:07:48 2016
@@ -0,0 +1,189 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.ml.BeamSearch;
+import opennlp.tools.ml.EventModelSequenceTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.SequenceTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.TrainerFactory.TrainerType;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Sequence;
+import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * A probabilistic lemmatizer.  Tries to predict the induced permutation class
+ * for each word depending on its surrounding context. Based on
+ * Grzegorz Chrupała. 2008. Towards a Machine-Learning Architecture
+ * for Lexical Functional Grammar Parsing. PhD dissertation, Dublin City University.
+ * http://grzegorz.chrupala.me/papers/phd-single.pdf
+ */
+public class LemmatizerME implements Lemmatizer {
+  
+  public static final int DEFAULT_BEAM_SIZE = 3;
+  protected int beamSize;
+  private Sequence bestSequence;
+  
+  private SequenceClassificationModel<String> model;
+  
+  private LemmatizerContextGenerator contextGenerator;
+  private SequenceValidator<String> sequenceValidator;
+  
+  /**
+   * Initializes the current instance with the provided model, using the beam
+   * size from the model manifest or, if none is set, the default of 3.
+   *
+   * @param model the model
+   */
+  public LemmatizerME(LemmatizerModel model) {
+    
+    LemmatizerFactory factory = model.getFactory();
+    int defaultBeamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+    String beamSizeString = 
model.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER);
+    if (beamSizeString != null) {
+      defaultBeamSize = Integer.parseInt(beamSizeString);
+    }
+    
+    contextGenerator = factory.getContextGenerator();
+    beamSize = defaultBeamSize;
+
+    sequenceValidator = factory.getSequenceValidator();
+
+    if (model.getLemmatizerSequenceModel() != null) {
+      this.model = model.getLemmatizerSequenceModel();
+    }
+    else {
+      this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
+          (MaxentModel) model.getLemmatizerSequenceModel(), 0);
+    }
+  }
+  
+public String[] lemmatize(String[] toks, String[] tags) {
+    bestSequence = model.bestSequence(toks, new Object[] {tags}, 
contextGenerator, sequenceValidator);
+    List<String> c = bestSequence.getOutcomes();
+    return c.toArray(new String[c.size()]);
+  }
+  
+  /**
+   * Decodes the lemma from the word and the induced lemma class.
+   * @param toks the array of tokens
+   * @param preds the predicted lemma classes
+   * @return the array of decoded lemmas
+   */
+  public String[] decodeLemmas(String[] toks, String[] preds) {
+    List<String> lemmas = new ArrayList<String>();
+    for (int i = 0; i < toks.length; i++) {
+      String lemma = 
StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]);
+      //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " + preds[i] + " " + lemma);
+      if (lemma.length() == 0) {
+        lemma = "_";
+      }
+      lemmas.add(lemma);
+    }
+    return lemmas.toArray(new String[lemmas.size()]);
+  }
+  
+  public Sequence[] topKSequences(String[] sentence, String[] tags) {
+    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
+        new Object[] { tags }, contextGenerator, sequenceValidator);
+  }
+
+  public Sequence[] topKSequences(String[] sentence, String[] tags, double 
minSequenceScore) {
+    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { 
tags }, minSequenceScore,
+        contextGenerator, sequenceValidator);
+  }
+
+  /**
+   * Populates the specified array with the probabilities of the last decoded sequence.  The
+   * sequence was determined based on the previous call to <code>lemmatize</code>.  The
+   * specified array should be at least as large as the number of tokens in the previous call to <code>lemmatize</code>.
+   *
+   * @param probs An array used to hold the probabilities of the last decoded sequence.
+   */
+  public void probs(double[] probs) {
+    bestSequence.getProbs(probs);
+  }
+
+    /**
+     * Returns an array with the probabilities of the last decoded sequence.  The
+     * sequence was determined based on the previous call to <code>lemmatize</code>.
+     * @return An array with the same number of probabilities as tokens were sent to <code>lemmatize</code>
+     * when it was last called.
+     */
+  public double[] probs() {
+    return bestSequence.getProbs();
+  }
+  
+  public static LemmatizerModel train(String languageCode,
+      ObjectStream<LemmaSample> samples, TrainingParameters trainParams,
+      LemmatizerFactory posFactory) throws IOException {
+
+    String beamSizeString = 
trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+
+    int beamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+    if (beamSizeString != null) {
+      beamSize = Integer.parseInt(beamSizeString);
+    }
+
+    LemmatizerContextGenerator contextGenerator = 
posFactory.getContextGenerator();
+
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    TrainerType trainerType = 
TrainerFactory.getTrainerType(trainParams.getSettings());
+
+    MaxentModel lemmatizerModel = null;
+    SequenceClassificationModel<String> seqLemmatizerModel = null;
+    if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+      ObjectStream<Event> es = new LemmaSampleEventStream(samples, 
contextGenerator);
+
+      EventTrainer trainer = 
TrainerFactory.getEventTrainer(trainParams.getSettings(),
+          manifestInfoEntries);
+      lemmatizerModel = trainer.train(es);
+    }
+    else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+      LemmaSampleSequenceStream ss = new LemmaSampleSequenceStream(samples, 
contextGenerator);
+      EventModelSequenceTrainer trainer = 
TrainerFactory.getEventModelSequenceTrainer(trainParams.getSettings(),
+          manifestInfoEntries);
+      lemmatizerModel = trainer.train(ss);
+    }
+    else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+      SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+          trainParams.getSettings(), manifestInfoEntries);
+
+      // TODO: This will probably cause issues, since the feature generator uses the outcomes array
+
+      LemmaSampleSequenceStream ss = new LemmaSampleSequenceStream(samples, 
contextGenerator);
+      seqLemmatizerModel = trainer.train(ss);
+    }
+    else {
+      throw new IllegalArgumentException("Trainer type is not supported: " + 
trainerType);
+    }
+
+    if (lemmatizerModel != null) {
+      return new LemmatizerModel(languageCode, lemmatizerModel, beamSize, 
manifestInfoEntries, posFactory);
+    }
+    else {
+      return new LemmatizerModel(languageCode, seqLemmatizerModel, 
manifestInfoEntries, posFactory);
+    }
+  }
+  
+  public Sequence[] topKLemmaClasses(String[] sentence, String[] tags) {
+    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
+        new Object[] { tags }, contextGenerator, sequenceValidator);
+  }
+
+  public Sequence[] topKLemmaClasses(String[] sentence, String[] tags, double 
minSequenceScore) {
+    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { 
tags }, minSequenceScore,
+        contextGenerator, sequenceValidator);
+  }
+}

Added: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java?rev=1731148&view=auto
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
 (added)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
 Thu Feb 18 21:07:48 2016
@@ -0,0 +1,107 @@
+package opennlp.tools.lemmatizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+import java.util.Properties;
+
+import opennlp.tools.ml.BeamSearch;
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+* The {@link LemmatizerModel} is the model used
+* by a learnable {@link Lemmatizer}.
+*
+* @see LemmatizerME
+*/
+public class LemmatizerModel extends BaseModel {
+
+ private static final String COMPONENT_NAME = "StatisticalLemmatizer";
+ private static final String LEMMATIZER_MODEL_ENTRY_NAME = "lemmatizer.model";
+
+ public LemmatizerModel(String languageCode, 
SequenceClassificationModel<String> lemmatizerModel,
+     Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+   super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+   artifactMap.put(LEMMATIZER_MODEL_ENTRY_NAME, lemmatizerModel);
+   checkArtifactMap();
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel,
+     Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+   this(languageCode, lemmatizerModel, LemmatizerME.DEFAULT_BEAM_SIZE, 
manifestInfoEntries, factory);
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel, int 
beamSize,
+     Map<String, String> manifestInfoEntries, LemmatizerFactory factory) {
+   super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+   artifactMap.put(LEMMATIZER_MODEL_ENTRY_NAME, lemmatizerModel);
+
+   Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+   manifest.put(BeamSearch.BEAM_SIZE_PARAMETER, Integer.toString(beamSize));
+   checkArtifactMap();
+ }
+
+ public LemmatizerModel(String languageCode, MaxentModel lemmatizerModel, 
LemmatizerFactory factory) {
+   this(languageCode, lemmatizerModel, null, factory);
+ }
+
+ public LemmatizerModel(InputStream in) throws IOException, 
InvalidFormatException {
+   super(COMPONENT_NAME, in);
+ }
+
+ public LemmatizerModel(File modelFile) throws IOException, 
InvalidFormatException {
+   super(COMPONENT_NAME, modelFile);
+ }
+
+ public LemmatizerModel(URL modelURL) throws IOException, 
InvalidFormatException {
+   super(COMPONENT_NAME, modelURL);
+ }
+
+ @Override
+ protected void validateArtifactMap() throws InvalidFormatException {
+   super.validateArtifactMap();
+
+   if (!(artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof 
AbstractModel)) {
+     throw new InvalidFormatException("Lemmatizer model is incomplete!");
+   }
+ }
+
+ public SequenceClassificationModel<String> getLemmatizerSequenceModel() {
+
+   Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+
+   if (artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof MaxentModel) {
+     String beamSizeString = 
manifest.getProperty(BeamSearch.BEAM_SIZE_PARAMETER);
+
+     int beamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
+     if (beamSizeString != null) {
+       beamSize = Integer.parseInt(beamSizeString);
+     }
+
+     return new BeamSearch<>(beamSize, (MaxentModel) 
artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME));
+   }
+   else if (artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME) instanceof 
SequenceClassificationModel) {
+     return (SequenceClassificationModel) 
artifactMap.get(LEMMATIZER_MODEL_ENTRY_NAME);
+   }
+   else {
+     return null;
+   }
+ }
+
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+   return LemmatizerFactory.class;
+ }
+
+
+ public LemmatizerFactory getFactory() {
+   return (LemmatizerFactory) this.toolFactory;
+ }
+}

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java?rev=1731148&r1=1731147&r2=1731148&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java 
(original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java 
Thu Feb 18 21:07:48 2016
@@ -232,6 +232,78 @@ public static void computeShortestEditSc
 }
 
 /**
+ * Reads the shortest edit script (SES) predicted by the lemmatizer model and
+ * applies its permutations to the word form to obtain the lemma.
+ * @param wordForm the wordForm
+ * @param permutations the permutations predicted by the lemmatizer model
+ * @return the lemma
+ */
+public static String decodeShortestEditScript(String wordForm, String 
permutations) {
+  
+  StringBuffer lemma = new StringBuffer(wordForm).reverse();
+  
+  int permIndex = 0;
+  while(true) {
+      if (permutations.length() <= permIndex) {
+        break;
+      }
+      //read first letter of permutation string
+      char nextOperation = permutations.charAt(permIndex);
+      //System.err.println("-> NextOP: " + nextOperation);
+      //go to the next permutation letter
+      permIndex++;
+      if (nextOperation == 'R') {
+          String charAtPerm = 
Character.toString(permutations.charAt(permIndex));
+          int charIndex = Integer.parseInt(charAtPerm);
+          // go to the next character in the permutation buffer
+          // which is the replacement character
+          permIndex++;
+          char replace = permutations.charAt(permIndex);
+          //go to the next char in the permutation buffer
+          // which is the candidate character
+          permIndex++;
+          char with = permutations.charAt(permIndex);
+          
+          if (lemma.length() <= charIndex) {
+            return wordForm; 
+          }
+          if (lemma.charAt(charIndex) == replace) {
+            lemma.setCharAt(charIndex, with);
+          }
+          //System.err.println("-> ROP: " + lemma.toString());
+          //go to next permutation
+          permIndex++;
+          
+      } else if (nextOperation == 'I') {
+          String charAtPerm = 
Character.toString(permutations.charAt(permIndex));
+          int charIndex = Integer.parseInt(charAtPerm);
+          permIndex++;
+          //character to be inserted
+          char in = permutations.charAt(permIndex);
+      
+          if (lemma.length() < charIndex) {
+            return wordForm; 
+          }
+          lemma.insert(charIndex, in);
+          //System.err.println("-> IOP " + lemma.toString());
+          //go to next permutation
+          permIndex++;
+      } else if (nextOperation == 'D') {
+          String charAtPerm = 
Character.toString(permutations.charAt(permIndex));
+          int charIndex = Integer.parseInt(charAtPerm);
+          if (lemma.length() <= charIndex) {
+            return wordForm;
+          }
+          lemma.deleteCharAt(charIndex);
+          permIndex++;
+          // go to next permutation
+          permIndex++;
+      }
+  }
+  return lemma.reverse().toString();
+}
+
+/**
  * Get the SES required to go from a word to a lemma.
  * @param wordForm the word
  * @param lemma the lemma

svn commit: r1731148 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: lemmatizer/LemmatizerME.java lemmatizer/LemmatizerModel.java util/StringUtil.java

Reply via email to