Author: joern
Date: Mon Aug 24 21:28:41 2015
New Revision: 1697504
URL: http://svn.apache.org/r1697504
Log:
OPENNLP-807 We have worked on the integration of the existing approaches.
MFS and IMS now work independently (unit tests will follow).
Mostly, we have formatted the IMS approach to be similar to other tools.
IMS now also saves and loads a model file per word for its training data
instead of 2 separate files (made as artifacts).
Thanks to Anthony Beylerian for providing a patch.
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java
(with props)
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/cmdline/disambiguator/DisambiguatorTool.java
Mon Aug 24 21:28:41 2015
@@ -35,7 +35,7 @@ import opennlp.tools.disambiguator.WSDHe
import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.WSDSampleStream;
import opennlp.tools.disambiguator.WSDisambiguator;
-import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.ims.IMSME;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.util.ObjectStream;
@@ -109,7 +109,6 @@ public class DisambiguatorTool extends C
} else if (params.getType().equalsIgnoreCase("lesk")) {
wsd = new Lesk();
} else if (params.getType().equalsIgnoreCase("ims")) {
- wsd = new IMS();
}
return wsd;
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
Mon Aug 24 21:28:41 2015
@@ -145,12 +145,13 @@ public class FeaturesExtractor {
*/
public ArrayList<String> extractTrainingSurroundingWords(
ArrayList<WTDIMS> trainingData) {
-
+
HashMap<String, Object> words = new HashMap<String, Object>();
-
+
for (WTDIMS word : trainingData) {
for (String sWord : word.getSurroundingWords()) {
- if (!words.containsKey(sWord.toLowerCase()));
+ if (!words.containsKey(sWord.toLowerCase()))
+ ;
words.put(sWord.toLowerCase(), null);
}
}
@@ -158,7 +159,7 @@ public class FeaturesExtractor {
ArrayList<String> list = new ArrayList<String>();
for (String word : words.keySet()) {
- list.add(word);
+ list.add(word);
}
return list;
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
Mon Aug 24 21:28:41 2015
@@ -60,12 +60,12 @@ public class WSDEvaluator extends Evalua
// get the best predicted sense
String predictedSense = disambiguator.disambiguate(reference.getSentence(),
- reference.getTags(),
- reference.getLemmas(), reference.getTargetPosition())[0];
+ reference.getTags(), reference.getLemmas(),
+ reference.getTargetPosition())[0];
if (predictedSense == null) {
- System.out.println("There was no sense for : " +
reference.getTargetWord());
- accuracy.add(0);
+ System.out.println("There was no sense for : "
+ + reference.getTargetWord());
return null;
}
// get the senseKey from the result
@@ -94,8 +94,8 @@ public class WSDEvaluator extends Evalua
}
}
- return new WSDSample(reference.getSentence(), reference.getTags(),
reference.getLemmas(),
- reference.getTargetPosition());
+ return new WSDSample(reference.getSentence(), reference.getTags(),
+ reference.getLemmas(), reference.getTargetPosition());
}
/**
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
Mon Aug 24 21:28:41 2015
@@ -28,16 +28,11 @@ public abstract class WSDParameters {
protected boolean isCoarseSense;
public static boolean isStemCompare;
- public static enum TrainingSource {
- SEMCOR, SEMEVAL, OTHER
- }
-
public static enum SenseSource {
WORDNET, WSDHELPER, OTHER;
}
protected SenseSource senseSource;
- protected TrainingSource trainingSource;
/**
* @return if the disambiguation type is coarse grained or fine grained
@@ -66,14 +61,6 @@ public abstract class WSDParameters {
this.senseSource = senseSource;
}
- public TrainingSource getTrainingSource() {
- return trainingSource;
- }
-
- public void setTrainingSource(TrainingSource trainingSource) {
- this.trainingSource = trainingSource;
- }
-
public WSDParameters() {
this.isCoarseSense = true;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
Mon Aug 24 21:28:41 2015
@@ -23,6 +23,7 @@ import java.util.Collections;
import java.util.List;
import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.dictionary.Dictionary;
import opennlp.tools.tokenize.WhitespaceTokenizer;
@@ -63,7 +64,7 @@ public class WSDSample {
;
checkArguments();
}
-
+
public WSDSample(String sentence[], String tags[], String[] lemmas,
int targetPosition, List<String> senseIDs) {
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
@@ -220,4 +221,26 @@ public class WSDSample {
}
return null;
}
+
+ public String getTargetWordTag() {
+
+ String wordBaseForm = this.getLemmas()[this.getTargetPosition()];
+
+ String ref = "";
+
+ if ((WSDHelper.getPOS(this.getTargetTag()) != null)) {
+ if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.VERB)) {
+ ref = wordBaseForm + ".v";
+ } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.NOUN)) {
+ ref = wordBaseForm + ".n";
+ } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADJECTIVE)) {
+ ref = wordBaseForm + ".a";
+ } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADVERB)) {
+ ref = wordBaseForm + ".r";
+ }
+ }
+
+ return ref;
+ }
+
}
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSampleStream.java
Mon Aug 24 21:28:41 2015
@@ -29,13 +29,16 @@ import opennlp.tools.util.PlainTextByLin
public class WSDSampleStream extends FilterObjectStream<String, WSDSample> {
- private static Logger logger =
Logger.getLogger(WSDSampleStream.class.getName());
+ private static Logger logger = Logger.getLogger(WSDSampleStream.class
+ .getName());
/**
* Initializes the current instance.
*
- * @param sentences reader with sentences
- * @throws IOException IOException
+ * @param sentences
+ * reader with sentences
+ * @throws IOException
+ * IOException
*/
public WSDSampleStream(Reader sentences) throws IOException {
super(new PlainTextByLineStream(sentences));
@@ -46,12 +49,11 @@ public class WSDSampleStream extends Fil
}
/**
- * Parses the next sentence and return the next
- * {@link WSDSample} object.
+ * Parses the next sentence and return the next {@link WSDSample} object.
*
- * If an error occurs an empty {@link WSDSample} object is returned
- * and an warning message is logged. Usually it does not matter if one
- * of many sentences is ignored.
+ * If an error occurs an empty {@link WSDSample} object is returned and an
+ * warning message is logged. Usually it does not matter if one of many
+ * sentences is ignored.
*
* TODO: An exception in error case should be thrown.
*/
@@ -66,15 +68,15 @@ public class WSDSampleStream extends Fil
} catch (InvalidFormatException e) {
if (logger.isLoggable(Level.WARNING)) {
- logger.warning("Error during parsing, ignoring sentence: " +
sentence);
+ logger
+ .warning("Error during parsing, ignoring sentence: " + sentence);
}
sample = null;// new WSDSample(new String[]{}, new String[]{},0);
}
return sample;
- }
- else {
+ } else {
// sentences stream is exhausted
return null;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
Mon Aug 24 21:28:41 2015
@@ -50,43 +50,51 @@ import opennlp.tools.util.Span;
*/
public abstract class WSDisambiguator {
+ protected WSDParameters params;
+
/**
* @return the parameters of the disambiguation algorithm
*/
- public abstract WSDParameters getParams();
+ public WSDParameters getParams() {
+ return params;
+ }
/**
* @param the
* disambiguation implementation specific parameters.
* @throws InvalidParameterException
*/
- public abstract void setParams(WSDParameters params) throws
InvalidParameterException;
+ public void setParams(WSDParameters params) throws InvalidParameterException
{
+ this.params = params;
+ }
/**
* @param tokenizedContext
- * @param tokenTags
+ * @param tokenTags
* @param lemmas
* @param ambiguousTokenIndex
* @return result as an array of WordNet IDs
*/
- public abstract String[] disambiguate(String[] tokenizedContext, String[]
tokenTags, String[] lemmas,
- int ambiguousTokenIndex);
+ public abstract String[] disambiguate(String[] tokenizedContext,
+ String[] tokenTags, String[] lemmas, int ambiguousTokenIndex);
- /** The disambiguation method for all the words in a Span
+ /**
+ * The disambiguation method for all the words in a Span
+ *
* @param tokenizedContext
* @param tokenTags
* @param ambiguousTokenIndexSpan
* @param ambiguousTokenLemma
* @return result as an array of WordNet IDs
*/
- public List<String[]> disambiguate(String[] tokenizedContext, String[]
tokenTags, String[] lemmas,
- Span ambiguousTokenIndexSpan){
+ public List<String[]> disambiguate(String[] tokenizedContext,
+ String[] tokenTags, String[] lemmas, Span ambiguousTokenIndexSpan) {
List<String[]> senses = new ArrayList<String[]>();
int start = Math.max(0, ambiguousTokenIndexSpan.getStart());
-
- int end = Math.max(start,Math.min(tokenizedContext.length,
ambiguousTokenIndexSpan.getEnd()));
+ int end = Math.max(start,
+ Math.min(tokenizedContext.length, ambiguousTokenIndexSpan.getEnd()));
for (int i = start; i < end + 1; i++) {
@@ -97,7 +105,7 @@ public abstract class WSDisambiguator {
} else {
if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
- String s = WSDParameters.SenseSource.WSDHELPER.name() + " "
+ String s = WSDParameters.SenseSource.WSDHELPER.name() + " "
+ WSDHelper.getNonRelevWordsDef(tokenTags[i]);
String[] sense = { s };
@@ -111,7 +119,7 @@ public abstract class WSDisambiguator {
return senses;
}
-
+
/**
* The disambiguation method for all the words of the context
*
@@ -138,7 +146,8 @@ public abstract class WSDisambiguator {
} else {
if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
- String s = IMSParameters.SenseSource.WSDHELPER.name() + " " +
tokenTags[i];
+ String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
+ + tokenTags[i];
String[] sense = { s };
senses.add(sense);
@@ -151,11 +160,11 @@ public abstract class WSDisambiguator {
return senses;
}
-
+
/**
* @param WSDSample
* @return result as an array of WordNet IDs
*/
public abstract String[] disambiguate(WSDSample sample);
-
+
}
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
Mon Aug 24 21:28:41 2015
@@ -21,9 +21,10 @@ package opennlp.tools.disambiguator.ims;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.List;
+import java.util.HashSet;
-import opennlp.tools.disambiguator.FeaturesExtractor;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
import opennlp.tools.disambiguator.ims.WTDIMS;
/**
@@ -31,11 +32,81 @@ import opennlp.tools.disambiguator.ims.W
*/
public class DefaultIMSContextGenerator implements IMSContextGenerator {
- FeaturesExtractor fExtractor = new FeaturesExtractor();
-
public DefaultIMSContextGenerator() {
}
+ private String[] extractPosOfSurroundingWords(int index, String[] tags,
+ int windowSize) {
+
+ String[] windowTags = new String[2 * windowSize + 1];
+
+ int j = 0;
+
+ for (int i = index - windowSize; i < index + windowSize; i++) {
+ if (i < 0 || i >= tags.length) {
+ windowTags[j] = "null";
+ } else {
+ windowTags[j] = tags[i].toLowerCase();
+ }
+ j++;
+ }
+
+ return windowTags;
+ }
+
+ public String[] extractSurroundingWords(int index, String[] toks,
+ String[] lemmas) {
+
+ ArrayList<String> contextWords = new ArrayList<String>();
+
+ for (int i = 0; i < toks.length; i++) {
+ if (lemmas != null) {
+ if (!WSDHelper.stopWords.contains(toks[i].toLowerCase())
+ && (index != i)) {
+
+ String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
+ .trim();
+
+ if (lemma.length() > 1) {
+ contextWords.add(lemma);
+ }
+
+ }
+ }
+ }
+
+ return contextWords.toArray(new String[contextWords.size()]);
+ }
+
+ private String[] extractLocalCollocations(int index, String[] sentence,
+ int ngram) {
+ /**
+ * Here the author used only 11 features of this type. the range was set to
+ * 3 (bigrams extracted in a way that they are at max separated by 1 word).
+ */
+
+ ArrayList<String> localCollocations = new ArrayList<String>();
+
+ for (int i = index - ngram; i <= index + ngram; i++) {
+
+ if (!(i < 0 || i > sentence.length - 2)) {
+ if ((i != index) && (i + 1 != index) && (i + 1 < index + ngram)) {
+ String lc = sentence[i] + " " + sentence[i + 1];
+ localCollocations.add(lc);
+ }
+ if ((i != index) && (i + 2 != index) && (i + 2 < index + ngram)) {
+ String lc = sentence[i] + " " + sentence[i + 2];
+ localCollocations.add(lc);
+ }
+ }
+
+ }
+ String[] res = new String[localCollocations.size()];
+ res = localCollocations.toArray(new String[localCollocations.size()]);
+
+ return res;
+ }
+
/**
* Get Context of a word To disambiguate
*
@@ -44,29 +115,52 @@ public class DefaultIMSContextGenerator
* @return The IMS context of the word to disambiguate
*/
@Override
- public String[] getContext(WTDIMS word) {
- return word.getFeatures();
+ public String[] getContext(int index, String[] toks, String[] tags,
+ String[] lemmas, int ngram, int windowSize, ArrayList<String> model) {
+
+ String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, toks,
+ windowSize);
+
+ HashSet<String> surroundingWords = new HashSet<>();
+ surroundingWords.addAll(Arrays.asList(extractSurroundingWords(index, toks,
+ lemmas)));
+
+ String[] localCollocations = extractLocalCollocations(index, toks, ngram);
+
+ String[] serializedFeatures = new String[posOfSurroundingWords.length
+ + localCollocations.length + model.size()];
+
+ int i = 0;
+
+ for (String feature : posOfSurroundingWords) {
+ serializedFeatures[i] = "F" + i + "=" + feature;
+ i++;
+ }
+
+ for (String feature : localCollocations) {
+ serializedFeatures[i] = "F" + i + "=" + feature;
+ i++;
+ }
+ for (String word : model) {
+
+ if (surroundingWords.contains(word.toString())) {
+ serializedFeatures[i] = "F" + i + "=1";
+ } else {
+ serializedFeatures[i] = "F" + i + "=0";
+ }
+ i++;
+
+ }
+
+ return serializedFeatures;
+
}
- /**
- * This methods gives the list of features for the object of type WTDIMS
- * Extensions of this class can override this method to create a customized
- * {@link IMSContextGenerator}
- *
- * @param word
- * : the word to disambiguate {@link WTDIMS} along with its sentence
- * [Check the Class WTDIMS]
- * @param numberOfSurroundingWords
- * : the number of surrounding words used in the feature
- * "POS Tags of Surrounding Words" Default value is 3
- * @param ngram
- * : the number of words used to extract the feature
- * "Local Collocations" Default value is 2
- *
- * @return an {@link ArrayList} of features
- */
- protected List<String> createContext(WTDIMS word) {
- return Arrays.asList(getContext(word));
+ public String[] getContext(WSDSample sample, int ngram, int windowSize,
+ ArrayList<String> model) {
+
+ return getContext(sample.getTargetPosition(), sample.getSentence(),
+ sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
}
}
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java?rev=1697504&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
Mon Aug 24 21:28:41 2015
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.ims;
+
+import opennlp.tools.util.SequenceValidator;
+
+public class DefaultIMSSequenceValidator implements SequenceValidator<String> {
+
+ private boolean validOutcome(String outcome, String prevOutcome) {
+ if (outcome.startsWith("I-")) {
+ if (prevOutcome == null) {
+ return (false);
+ } else {
+ if (prevOutcome.equals("O")) {
+ return (false);
+ }
+ if (!prevOutcome.substring(2).equals(outcome.substring(2))) {
+ return (false);
+ }
+ }
+ }
+ return true;
+ }
+
+ protected boolean validOutcome(String outcome, String[] sequence) {
+ String prevOutcome = null;
+ if (sequence.length > 0) {
+ prevOutcome = sequence[sequence.length - 1];
+ }
+ return validOutcome(outcome, prevOutcome);
+ }
+
+ public boolean validSequence(int i, String[] sequence, String[] s,
+ String outcome) {
+ return validOutcome(outcome, s);
+ }
+
+}
\ No newline at end of file
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
Mon Aug 24 21:28:41 2015
@@ -1,504 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import opennlp.tools.ml.maxent.GIS;
-import opennlp.tools.ml.maxent.io.GISModelReader;
-import opennlp.tools.ml.maxent.io.SuffixSensitiveGISModelWriter;
-import opennlp.tools.ml.model.AbstractModel;
-import opennlp.tools.ml.model.AbstractModelWriter;
-import opennlp.tools.ml.model.DataIndexer;
-import opennlp.tools.ml.model.DataReader;
-import opennlp.tools.ml.model.Event;
-import opennlp.tools.ml.model.OnePassDataIndexer;
-import opennlp.tools.ml.model.PlainTextFileDataReader;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.security.InvalidParameterException;
-import java.util.ArrayList;
-import java.util.zip.GZIPInputStream;
-
-import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.ObjectStreamUtils;
-import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.disambiguator.FeaturesExtractor;
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDParameters;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.WSDisambiguator;
-import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.disambiguator.mfs.MFS;
-
-/**
- * Implementation of the <b>It Makes Sense</b> approach originally proposed in
- * Senseval-3. The approach relies on the extraction of textual and
- * PoS-tag-based features from the sentences surrounding the word to
- * disambiguate. 3 main families of features are extracted:
- * <ul>
- * <li>PoS-tags of the surrounding words</li>
- * <li>Local collocations</li>
- * <li>Surrounding words</li>
- * </ul>
- * check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details
- * about this approach
- */
-public class IMS extends WSDisambiguator {
-
- public IMSParameters parameters;
-
- private final IMSContextGenerator cg;
-
- private FeaturesExtractor fExtractor = new FeaturesExtractor();
-
- /**
- * Sets the input parameters to the default ones
- *
- * @throws InvalidParameterException
- */
- public IMS() {
- super();
- // Loader loader = new Loader();
- this.parameters = new IMSParameters();
- this.cg = parameters.createContextGenerator();
- }
-
- /**
- * Initializes the loader object and sets the input parameters
- *
- * @param parameters
- * The parameters to be used
- * @throws InvalidParameterException
- */
- public IMS(IMSParameters parameters) {
- super();
- this.parameters = parameters;
- this.cg = this.parameters.createContextGenerator();
- }
-
- /**
- * Returns that parameter settings of the IMS object.
- *
- * @return the parameter settings
- */
- @Override
- public WSDParameters getParams() {
- return this.parameters;
- }
-
- /**
- * Returns that parameter settings of the IMS object. The returned parameters
- * are of type {@link IMSParameters}
- *
- * @return the parameter settings
- */
- public IMSParameters getParameters() {
- return this.parameters;
- }
-
- /**
- * If the parameters are null, set the default ones. Otherwise, only set them
- * if they valid. Invalid parameters will return a exception (and set the
- * parameters to the default ones)
- *
- * @param Input
- * parameters
- * @throws InvalidParameterException
- */
- @Override
- public void setParams(WSDParameters parameters)
- throws InvalidParameterException {
- if (parameters == null) {
- this.parameters = new IMSParameters();
- } else {
- if (parameters.isValid()) {
- this.parameters = (IMSParameters) parameters;
- } else {
- this.parameters = new IMSParameters();
- throw new InvalidParameterException("wrong parameters");
- }
- }
-
- }
-
- /**
- * If the parameters are null, set the default ones. Otherwise, only set them
- * if they valid. Invalid parameters will return a exception (and set the
- * parameters to the default ones)
- *
- * @param Input
- * parameters
- * @throws InvalidParameterException
- */
- public void setParams(IMSParameters parameters)
- throws InvalidParameterException {
- if (parameters == null) {
- this.parameters = new IMSParameters();
- } else {
- if (parameters.isValid()) {
- this.parameters = parameters;
- } else {
- this.parameters = new IMSParameters();
- throw new InvalidParameterException("wrong parameters");
- }
- }
- }
-
- // Internal Methods
- private ArrayList<String> getAllSurroundingWords(String wordTag) {
-
- ArrayList<String> surrWords = new ArrayList<String>();
-
- BufferedReader br = null;
-
- File file = new File(IMSParameters.trainingDataDirectory + wordTag +
".sw");
-
- if (file.exists()) {
-
- try {
- br = new BufferedReader(new FileReader(file));
-
- String line = br.readLine();
- while (line != null) {
- line = br.readLine();
- if (!surrWords.contains(line)) {
- surrWords.add(line);
- }
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- return surrWords;
-
- }
-
- private void saveAllSurroundingWords(ArrayList<WTDIMS> trainingInstances,
- String wordTag) {
-
- ArrayList<String> surrWords = fExtractor
- .extractTrainingSurroundingWords(trainingInstances);
-
- File file = new File(IMSParameters.trainingDataDirectory + wordTag +
".sw");
- if (!file.exists()) {
-
- try {
- file.createNewFile();
- } catch (IOException e) {
- System.out
- .println("Unable to create the List of Surrounding Words file !");
- }
- }
-
- try {
- FileWriter fw = new FileWriter(file.getAbsoluteFile());
- BufferedWriter bw = new BufferedWriter(fw);
-
- for (String surrWord : surrWords) {
- bw.write(surrWord);
- bw.newLine();
- }
-
- bw.close();
- } catch (IOException e) {
- System.out
- .println("Unable to create the List of Surrounding Words file !");
- e.printStackTrace();
- }
-
- }
-
- private void extractFeature(WTDIMS word) {
-
- fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(),
- this.parameters.getNgram());
-
- }
-
- /**
- * Method for training a model
- *
- * @param wordTag
- * the word to disambiguate. It should be written in the format
- * "word.p" (Exp: "write.v", "well.r", "smart.a", "go.v"
- * @param trainParams
- * the parameters used for training
- * @param trainingInstances
- * the training data in the format {@link WTDIMS}
- */
- public void train(String wordTag, TrainingParameters trainParams,
- ArrayList<WTDIMS> trainingInstances) {
-
- String wordTrainingbinFile = IMSParameters.trainingDataDirectory + wordTag
- + ".gz";
-
- ObjectStream<Event> IMSes = null;
-
- for (WTDIMS wtd : trainingInstances) {
- extractFeature(wtd);
- }
-
- saveAllSurroundingWords(trainingInstances, wordTag);
-
- ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
-
- for (WTDIMS wtd : trainingInstances) {
- fExtractor.serializeIMSFeatures(wtd, surrWords);
- }
-
- ArrayList<Event> events = new ArrayList<Event>();
-
- for (WTDIMS wtd : trainingInstances) {
-
- String sense = wtd.getSenseIDs().get(0);
-
- String[] context = cg.getContext(wtd);
-
- Event ev = new Event(sense + "", context);
-
- events.add(ev);
-
- IMSes = ObjectStreamUtils.createObjectStream(events);
-
- }
-
- DataIndexer indexer;
- try {
- indexer = new OnePassDataIndexer((ObjectStream<Event>) IMSes);
- MaxentModel trainedMaxentModel = GIS.trainModel(200, indexer);
- File outFile = new File(wordTrainingbinFile);
- AbstractModelWriter writer = new SuffixSensitiveGISModelWriter(
- (AbstractModel) trainedMaxentModel, outFile);
- writer.persist();
-
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- }
-
- /**
- * Load an existing model
- *
- * @param trainedModel
- * Name of the file of the already trained model
- * @return the model trained
- */
- public MaxentModel load(String trainedModel) {
-
- MaxentModel loadedMaxentModel = null;
-
- FileInputStream inputStream;
- try {
- inputStream = new FileInputStream(trainedModel);
- InputStream decodedInputStream = new GZIPInputStream(inputStream);
- DataReader modelReader = new PlainTextFileDataReader(decodedInputStream);
- loadedMaxentModel = new GISModelReader(modelReader).getModel();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- return loadedMaxentModel;
- }
-
- /**
- * The disambiguation method for a single word, it requires as input one
- * object of type WTDIMS
- *
- * @param inputText
- * : the text containing the word to disambiguate
- * @param inputWordIndex
- * : the index of the word to disambiguate
- */
- public String[] disambiguate(WTDIMS wordToDisambiguate) {
-
- String trainingDataDirectory = IMSParameters.trainingDataDirectory;
-
- File file = new File(trainingDataDirectory);
-
- if (!file.exists()) {
- file.mkdirs();
- }
-
- fExtractor.extractIMSFeatures(wordToDisambiguate,
- this.parameters.getWindowSize(), this.parameters.getNgram());
-
- String wordTag = wordToDisambiguate.getWordTag();
-
- String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz";
-
- File bf = new File(wordTrainingbinFile);
-
- MaxentModel loadedMaxentModel = null;
- String outcome = "";
-
- if (bf.exists() && !bf.isDirectory()) {
- // If the trained model exists
- ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
- fExtractor.serializeIMSFeatures(wordToDisambiguate, surrWords);
-
- loadedMaxentModel = load(wordTrainingbinFile);
- String[] context = cg.getContext(wordToDisambiguate);
-
- double[] outcomeProbs = loadedMaxentModel.eval(context);
- outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
-
- } else {
- // Depending on the source, go fetch the training data
- ArrayList<WTDIMS> trainingInstances = new ArrayList<WTDIMS>();
- switch (this.parameters.getTrainingSource()) {
- case SEMCOR: {
- SemcorReaderExtended sReader = new SemcorReaderExtended();
- for (WSDSample ti : sReader.getSemcorData(wordTag)) {
- WTDIMS imsIT = new WTDIMS(ti);
- extractFeature(imsIT);
- trainingInstances.add(imsIT);
- }
- break;
- }
-
- case SEMEVAL: {
- SensevalReader sReader = new SensevalReader();
- for (WSDSample ti : sReader.getSensevalData(wordTag)) {
- WTDIMS imsIT = new WTDIMS(ti);
- extractFeature(imsIT);
- trainingInstances.add(imsIT);
- }
- break;
- }
-
- case OTHER: {
- // TODO check the case when the user selects his own data set (make an
- // interface to collect training data)
- break;
- }
- }
-
- if (!trainingInstances.isEmpty()) {
-
- train(wordTag, null, trainingInstances);
-
- ArrayList<String> surrWords = getAllSurroundingWords(wordTag);
-
- fExtractor.serializeIMSFeatures(wordToDisambiguate, surrWords);
-
- bf = new File(wordTrainingbinFile);
- loadedMaxentModel = load(wordTrainingbinFile);
- String[] context = cg.getContext(wordToDisambiguate);
-
- double[] outcomeProbs = loadedMaxentModel.eval(context);
- outcome = loadedMaxentModel.getBestOutcome(outcomeProbs);
- }
-
- }
-
- if (!outcome.equals("")) {
-
- outcome = parameters.getSenseSource().name() + " "
- + wordTag.split("\\.")[0] + "%" + outcome;
-
- String[] s = { outcome };
-
- return s;
-
- } else {
- // if no training data exist
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
-
- }
-
- @Override
- public String[] disambiguate(WSDSample sample) {
- if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
- WTDIMS wordToDisambiguate = new WTDIMS(sample);
- return disambiguate(wordToDisambiguate);
-
- } else {
- if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
- String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
- + sample.getTargetTag();
- String[] sense = { s };
- return sense;
- } else {
- return null;
- }
- }
-
- }
-
- /**
- * The IMS disambiguation method for a single word
- *
- * @param tokenizedContext
- * : the text containing the word to disambiguate
- * @param tokenTags
- * : the tags corresponding to the context
- * @param lemmas
- * : the lemmas of ALL the words in the context
- * @param index
- * : the index of the word to disambiguate
- * @return an array of the senses of the word to disambiguate
- */
- public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
- String[] lemmas, int index) {
-
- if (WSDHelper.isRelevantPOSTag(tokenTags[index])) {
- WTDIMS wordToDisambiguate = new WTDIMS(tokenizedContext, tokenTags,
- lemmas, index);
- return disambiguate(wordToDisambiguate);
-
- } else {
- if (WSDHelper.getNonRelevWordsDef(tokenTags[index]) != null) {
- String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
- + tokenTags[index];
- String[] sense = { s };
- return sense;
- } else {
- return null;
- }
- }
-
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
Mon Aug 24 21:28:41 2015
@@ -19,11 +19,18 @@
package opennlp.tools.disambiguator.ims;
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.WSDSample;
+
/**
* Interface for {@link IMSME} context generators.
*/
public interface IMSContextGenerator {
- public String[] getContext(WTDIMS word);
+ String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
+ int ngram, int windowSize, ArrayList<String> model);
+ String[] getContext(WSDSample sample, int ngram, int windowSize,
+ ArrayList<String> model);
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSEventStream.java
Mon Aug 24 21:28:41 2015
@@ -1,53 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import opennlp.tools.ml.model.Event;
-import opennlp.tools.util.AbstractEventStream;
-import opennlp.tools.util.ObjectStream;
-
-public class IMSEventStream extends AbstractEventStream<WTDIMS> {
-
- private IMSContextGenerator cg;
-
- public IMSEventStream(ObjectStream<WTDIMS> samples) {
- super(samples);
- }
-
- @Override
- protected Iterator<Event> createEvents(WTDIMS sample) {
- List<Event> events = new ArrayList<Event>();
-
- int sense = sample.getSense();
-
- String[] context = cg.getContext(sample);
-
- Event ev = new Event(sense + "", context);
-
- events.add(ev);
-
- return events.iterator();
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
Mon Aug 24 21:28:41 2015
@@ -1 +1 @@
-// TODO To be removed
\ No newline at end of file
+/* * Licensed to the Apache Software Foundation (ASF) under one or more *
contributor license agreements. See the NOTICE file distributed with * this
work for additional information regarding copyright ownership. * The ASF
licenses this file to You under the Apache License, Version 2.0 * (the
"License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * *
http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable
law or agreed to in writing, software * distributed under the License is
distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. * See the License for the specific language
governing permissions and * limitations under the License. */package
opennlp.tools.disambiguator.ims;import
opennlp.tools.util.BaseToolFactory;import
opennlp.tools.util.InvalidFormatException;import
opennlp.tools.util.SequenceValidator;import opennlp.tools.util.ext.Extensi
onLoader;public class IMSFactory extends BaseToolFactory { /** * Creates a
{@link IMSFactory} that provides the default implementation of * the
resources. */ public IMSFactory() { } public static IMSFactory
create(String subclassName) throws InvalidFormatException { if
(subclassName == null) { // will create the default factory return
new IMSFactory(); } try { IMSFactory theFactory =
ExtensionLoader.instantiateExtension( IMSFactory.class, subclassName);
return theFactory; } catch (Exception e) { String msg = "Could not
instantiate the " + subclassName + ". The initialization throw an
exception."; System.err.println(msg); e.printStackTrace(); throw
new InvalidFormatException(msg, e); } } @Override public void
validateArtifactMap() throws InvalidFormatException { // no additional
artifacts } public IMSContextGenerator getContextGenerator() { return new
DefaultIMSContextGenerator(); } p
ublic SequenceValidator<String> getSequenceValidator() { return new
DefaultIMSSequenceValidator(); }}
\ No newline at end of file
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java?rev=1697504&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
Mon Aug 24 21:28:41 2015
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.ims;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.mfs.MFS;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * IMS word sense disambiguator backed by an {@link IMSModel}.
+ * <p>
+ * {@link #train} builds a model from a stream of {@link WSDSample}s.
+ * {@link #disambiguate(WSDSample)} evaluates a sample against the current
+ * model; when the current model is missing or belongs to another word it
+ * tries to load the file named
+ * {@code <trainingDataDirectory><wordTag>.ims.model}, and it falls back to
+ * {@link MFS} when no usable model or outcome is available.
+ */
+public class IMSME extends WSDisambiguator {
+
+  protected IMSModel imsModel;
+
+  protected static IMSContextGenerator cg = new DefaultIMSContextGenerator();
+
+  /**
+   * Creates a disambiguator without a model; a model is looked up in the
+   * training data directory of {@code params} on the first disambiguation.
+   *
+   * @param params
+   *          the IMS parameters to use
+   */
+  public IMSME(IMSParameters params) {
+    this.params = params;
+  }
+
+  /**
+   * Creates a disambiguator for an already loaded model.
+   *
+   * @param model
+   *          the model to evaluate samples against
+   * @param params
+   *          the IMS parameters; window size and n-gram must equal the
+   *          model's
+   * @throws IllegalArgumentException
+   *           if the window size or n-gram of {@code params} differs from
+   *           the model's
+   */
+  public IMSME(IMSModel model, IMSParameters params) {
+    // Plain argument checks: production code must not depend on the JUnit
+    // assertion classes being on the runtime classpath.
+    if (model.getWindowSize() != params.getWindowSize()) {
+      throw new IllegalArgumentException("Window size mismatch: model has "
+          + model.getWindowSize() + ", parameters have "
+          + params.getWindowSize());
+    }
+    if (model.getNgram() != params.getNgram()) {
+      throw new IllegalArgumentException("Ngram mismatch: model has "
+          + model.getNgram() + ", parameters have " + params.getNgram());
+    }
+
+    this.imsModel = model;
+    this.params = params;
+  }
+
+  public void setModel(IMSModel model) {
+    this.imsModel = model;
+  }
+
+  public void setParameters(IMSParameters parameters) {
+    this.params = parameters;
+  }
+
+  /**
+   * Trains an {@link IMSModel} from the given samples.
+   * <p>
+   * The stream is read twice: once by {@link #buildSurroundingWords}, which
+   * resets it afterwards, and once here to create one training event per
+   * sample. The word tag stored in the model is the one of the first sample.
+   *
+   * @param lang
+   *          the language code stored in the model
+   * @param samples
+   *          the training samples; the stream must support {@code reset()}
+   * @param mlParams
+   *          the settings used to select and configure the event trainer
+   * @param imsParams
+   *          supplies the n-gram and window size used for feature extraction
+   * @param imsfactory
+   *          the factory stored in the model
+   * @return the trained model
+   * @throws IOException
+   *           if reading the samples or training fails
+   * @throws IllegalArgumentException
+   *           if the stream contains no samples
+   */
+  public static IMSModel train(String lang, ObjectStream<WSDSample> samples,
+      TrainingParameters mlParams, IMSParameters imsParams,
+      IMSFactory imsfactory) throws IOException {
+
+    ArrayList<String> surroundingWordModel = buildSurroundingWords(samples);
+
+    HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+    ArrayList<Event> events = new ArrayList<Event>();
+    String wordTag = "";
+
+    WSDSample sample;
+    while ((sample = samples.read()) != null) {
+      if (events.isEmpty()) {
+        // the first sample decides which word the model is for
+        wordTag = sample.getTargetWordTag();
+      }
+
+      String sense = sample.getSenseIDs().get(0);
+      String[] context = cg.getContext(sample, imsParams.ngram,
+          imsParams.windowSize, surroundingWordModel);
+
+      events.add(new Event(sense, context));
+    }
+
+    if (events.isEmpty()) {
+      // Previously a null event stream was handed to the trainer here.
+      throw new IllegalArgumentException(
+          "Cannot train an IMS model: the sample stream is empty.");
+    }
+
+    // Create the stream once, after all events have been collected.
+    ObjectStream<Event> es = ObjectStreamUtils.createObjectStream(events);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams.getSettings(), manifestInfoEntries);
+    MaxentModel imsModel = trainer.train(es);
+
+    return new IMSModel(lang, wordTag, imsParams.windowSize, imsParams.ngram,
+        imsModel, surroundingWordModel, manifestInfoEntries, imsfactory);
+  }
+
+  /**
+   * Collects the surrounding words of every sample in the stream, in reading
+   * order and without removing duplicates, then resets the stream.
+   *
+   * @param samples
+   *          the samples to scan; reset before returning
+   * @return the collected surrounding words
+   * @throws IOException
+   *           if reading or resetting the stream fails
+   */
+  public static ArrayList<String> buildSurroundingWords(
+      ObjectStream<WSDSample> samples) throws IOException {
+    DefaultIMSContextGenerator imsCG = new DefaultIMSContextGenerator();
+    ArrayList<String> surroundingWordsModel = new ArrayList<String>();
+    WSDSample sample;
+    while ((sample = samples.read()) != null) {
+      String[] words = imsCG.extractSurroundingWords(
+          sample.getTargetPosition(), sample.getSentence(),
+          sample.getLemmas());
+
+      for (String word : words) {
+        surroundingWordsModel.add(word);
+      }
+    }
+    samples.reset();
+    return surroundingWordsModel;
+  }
+
+  /**
+   * Disambiguates the target word of the sample.
+   *
+   * @param sample
+   *          the sample holding the word to disambiguate
+   * @return for a relevant POS tag, a single sense of the form
+   *         {@code "<senseSource> <word>%<outcome>"}, or the result of
+   *         {@link MFS} when no model or outcome is available; for other
+   *         tags, a single {@code "WSDHELPER <tag>"} entry when
+   *         {@code WSDHelper} has a definition for the tag, otherwise
+   *         {@code null}
+   */
+  @Override
+  public String[] disambiguate(WSDSample sample) {
+    if (!WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
+      if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
+        String s = IMSParameters.SenseSource.WSDHELPER.name() + " "
+            + sample.getTargetTag();
+        return new String[] { s };
+      }
+      return null;
+    }
+
+    String wordTag = sample.getTargetWordTag();
+
+    boolean modelMatches = imsModel != null
+        && imsModel.getWordTag().equals(wordTag);
+    if (!modelMatches && !loadModelFor(wordTag)) {
+      // No model file, or it could not be read: never evaluate with a
+      // missing model or with a model that belongs to another word.
+      return new MFS().disambiguate(wordTag);
+    }
+
+    return classify(sample, wordTag);
+  }
+
+  /**
+   * Tries to load the model file of the given word from the training data
+   * directory and, on success, makes it the current model.
+   *
+   * @return {@code true} if the model was loaded, {@code false} if the file
+   *         does not exist, is a directory, or could not be read
+   */
+  private boolean loadModelFor(String wordTag) {
+    String trainingFile = ((IMSParameters) this.getParams())
+        .getTrainingDataDirectory() + wordTag;
+
+    File file = new File(trainingFile + ".ims.model");
+    if (!file.exists() || file.isDirectory()) {
+      return false;
+    }
+
+    try {
+      setModel(new IMSModel(file));
+      return true;
+    } catch (IOException e) {
+      // InvalidFormatException is an IOException, so this covers both.
+      e.printStackTrace();
+      return false;
+    }
+  }
+
+  /**
+   * Evaluates the sample against the current model and formats the best
+   * outcome, falling back to {@link MFS} when the model gives no outcome.
+   */
+  private String[] classify(WSDSample sample, String wordTag) {
+    IMSParameters imsParams = (IMSParameters) this.params;
+
+    String[] context = cg.getContext(sample, imsParams.ngram,
+        imsParams.windowSize, imsModel.getSurroundingWords());
+
+    MaxentModel maxent = imsModel.getIMSMaxentModel();
+    double[] outcomeProbs = maxent.eval(context);
+    String outcome = maxent.getBestOutcome(outcomeProbs);
+
+    if (outcome == null || outcome.equals("")) {
+      return new MFS().disambiguate(wordTag);
+    }
+
+    String sense = this.getParams().getSenseSource().name() + " "
+        + wordTag.split("\\.")[0] + "%" + outcome;
+    return new String[] { sense };
+  }
+
+  /**
+   * The IMS disambiguation method for a single word
+   *
+   * @param tokenizedContext
+   *          : the text containing the word to disambiguate
+   * @param tokenTags
+   *          : the tags corresponding to the context
+   * @param lemmas
+   *          : the lemmas of ALL the words in the context
+   * @param index
+   *          : the index of the word to disambiguate
+   * @return an array of the senses of the word to disambiguate
+   */
+  public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
+      String[] lemmas, int index) {
+    return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+        index));
+  }
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java?rev=1697504&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
Mon Aug 24 21:28:41 2015
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.ims;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Properties;
+import java.net.URL;
+
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * Model package for the IMS disambiguator of a single word.
+ * <p>
+ * The classifier is stored as the {@code IMS.model} artifact. The word tag,
+ * window size, n-gram and the comma-joined surrounding words are stored as
+ * manifest properties and copied into fields when a model is created or
+ * loaded.
+ * <p>
+ * NOTE(review): the surrounding words are joined and split on {@code ","}
+ * with no escaping, so a surrounding word containing a comma would not
+ * survive a save/load round trip - confirm the extractor cannot emit one.
+ */
+public class IMSModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "IMSME";
+  private static final String IMS_MODEL_ENTRY_NAME = "IMS.model";
+
+  // Manifest property keys.
+  private static final String WORDTAG = "wordtag";
+  private static final String WINSIZE = "winsize";
+  private static final String NGRAM = "ngram";
+  private static final String SURROUNDINGS = "surroundings";
+
+  private ArrayList<String> surroundingWords = new ArrayList<String>();
+  private String wordTag;
+
+  private int windowSize;
+  private int ngram;
+
+  public ArrayList<String> getSurroundingWords() {
+    return surroundingWords;
+  }
+
+  public int getWindowSize() {
+    return windowSize;
+  }
+
+  public void setWindowSize(int windowSize) {
+    this.windowSize = windowSize;
+  }
+
+  public int getNgram() {
+    return ngram;
+  }
+
+  public void setNgram(int ngram) {
+    this.ngram = ngram;
+  }
+
+  public void setSurroundingWords(ArrayList<String> surroundingWords) {
+    this.surroundingWords = surroundingWords;
+  }
+
+  public String getWordTag() {
+    return wordTag;
+  }
+
+  public void setWordTag(String wordTag) {
+    this.wordTag = wordTag;
+  }
+
+  /**
+   * Creates a model from a freshly trained classifier.
+   *
+   * @param languageCode
+   *          the language code of the model
+   * @param wordTag
+   *          the word the model is for, e.g. {@code write.v}
+   * @param windowSize
+   *          the window size used for feature extraction
+   * @param ngram
+   *          the n-gram size used for feature extraction
+   * @param imsModel
+   *          the trained classifier
+   * @param surroundingWords
+   *          the surrounding words collected from the training data
+   * @param manifestInfoEntries
+   *          additional manifest entries
+   * @param factory
+   *          the tool factory of the model
+   */
+  public IMSModel(String languageCode, String wordTag, int windowSize,
+      int ngram, MaxentModel imsModel, ArrayList<String> surroundingWords,
+      Map<String, String> manifestInfoEntries, IMSFactory factory) {
+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
+    artifactMap.put(IMS_MODEL_ENTRY_NAME, imsModel);
+    this.setManifestProperty(WORDTAG, wordTag);
+    this.setManifestProperty(WINSIZE, windowSize + "");
+    this.setManifestProperty(NGRAM, ngram + "");
+    this.setManifestProperty(SURROUNDINGS,
+        StringUtils.join(surroundingWords, ","));
+
+    // Keep the fields in sync with the manifest; without this a freshly
+    // trained model reports a null word tag and 0 for window size and n-gram
+    // until it has been written and loaded again.
+    this.wordTag = wordTag;
+    this.windowSize = windowSize;
+    this.ngram = ngram;
+    this.surroundingWords = surroundingWords;
+    checkArtifactMap();
+  }
+
+  /**
+   * Same as the full constructor, without additional manifest entries.
+   */
+  public IMSModel(String languageCode, String wordTag, int windowSize,
+      int ngram, MaxentModel imsModel, ArrayList<String> surroundingWords,
+      IMSFactory factory) {
+    this(languageCode, wordTag, windowSize, ngram, imsModel, surroundingWords,
+        null, factory);
+  }
+
+  public IMSModel(InputStream in) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, in);
+    updateAttributes();
+  }
+
+  public IMSModel(File modelFile) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, modelFile);
+    updateAttributes();
+  }
+
+  public IMSModel(URL modelURL) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, modelURL);
+    updateAttributes();
+  }
+
+  /**
+   * Writes this model to {@code path + ".ims.model"}.
+   *
+   * @param path
+   *          the target path without extension; it must include the word tag,
+   *          e.g. a path ending in {@code write.v}
+   * @return always {@code true}
+   */
+  public boolean writeModel(String path) {
+    File outFile = new File(path + ".ims.model");
+    CmdLineUtil.writeModel("ims model", outFile, this);
+    return true;
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(IMS_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("IMS model is incomplete!");
+    }
+  }
+
+  /**
+   * @return the classifier artifact, or {@code null} if the artifact is not a
+   *         {@link MaxentModel}
+   */
+  public MaxentModel getIMSMaxentModel() {
+    Object artifact = artifactMap.get(IMS_MODEL_ENTRY_NAME);
+    return (artifact instanceof MaxentModel) ? (MaxentModel) artifact : null;
+  }
+
+  /**
+   * Copies the word tag, window size, n-gram and surrounding words from the
+   * manifest into the fields of this model. A missing or empty surrounding
+   * words entry yields an empty list.
+   *
+   * @throws NumberFormatException
+   *           if the window size or n-gram entry is missing or not a number
+   */
+  public void updateAttributes() {
+    Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+    String surroundings = manifest.getProperty(SURROUNDINGS);
+
+    if (surroundings == null || surroundings.isEmpty()) {
+      // "".split(",") would produce a single empty word
+      this.surroundingWords = new ArrayList<String>();
+    } else {
+      this.surroundingWords = new ArrayList<String>(
+          Arrays.asList(surroundings.split(",")));
+    }
+    this.wordTag = manifest.getProperty(WORDTAG);
+    this.windowSize = Integer.parseInt(manifest.getProperty(WINSIZE));
+    this.ngram = Integer.parseInt(manifest.getProperty(NGRAM));
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return IMSFactory.class;
+  }
+
+  public IMSFactory getFactory() {
+    return (IMSFactory) this.toolFactory;
+  }
+
+}
\ No newline at end of file
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
Mon Aug 24 21:28:41 2015
@@ -33,9 +33,7 @@ public class IMSParameters extends WSDPa
protected int windowSize;
protected int ngram;
- public static final String resourcesFolder = "src\\test\\resources\\";
- public static final String trainingDataDirectory = resourcesFolder
- + "supervised\\models\\";
+ protected String trainingDataDirectory;
/**
* This constructor takes only two parameters. The default language used is
@@ -50,13 +48,13 @@ public class IMSParameters extends WSDPa
* @param source
* the source of the training data
*/
- public IMSParameters(int windowSize, int ngram,
- TrainingSource trainingSource, SenseSource senseSource) {
+ public IMSParameters(int windowSize, int ngram, SenseSource senseSource,
+ String trainingDataDirectory) {
this.languageCode = "En";
this.windowSize = windowSize;
this.ngram = ngram;
- this.trainingSource = trainingSource;
this.senseSource = senseSource;
+ this.trainingDataDirectory = trainingDataDirectory;
this.isCoarseSense = false;
File folder = new File(trainingDataDirectory);
@@ -64,16 +62,20 @@ public class IMSParameters extends WSDPa
folder.mkdirs();
}
- public IMSParameters() {
- this(3, 2, TrainingSource.SEMCOR, SenseSource.WORDNET);
+ public IMSParameters(String trainingDataDirectory) {
+ this(3, 2, SenseSource.WORDNET, trainingDataDirectory);
+
+ File folder = new File(trainingDataDirectory);
+ if (!folder.exists())
+ folder.mkdirs();
}
- public IMSParameters(TrainingSource source) {
- this(3, 2, source, SenseSource.WORDNET);
+ public IMSParameters() {
+ this(3, 2, SenseSource.WORDNET, null);
}
public IMSParameters(int windowSize, int ngram) {
- this(windowSize, ngram, TrainingSource.SEMCOR, SenseSource.WORDNET);
+ this(windowSize, ngram, SenseSource.WORDNET, null);
}
public String getLanguageCode() {
@@ -111,6 +113,14 @@ public class IMSParameters extends WSDPa
return new DefaultIMSContextGenerator();
}
+ public String getTrainingDataDirectory() {
+ return trainingDataDirectory;
+ }
+
+ public void setTrainingDataDirectory(String trainingDataDirectory) {
+ this.trainingDataDirectory = trainingDataDirectory;
+ }
+
@Override
public boolean isValid() {
// TODO Auto-generated method stub
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java?rev=1697504&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java
Mon Aug 24 21:28:41 2015
@@ -0,0 +1 @@
+// TODO to be removed
\ No newline at end of file
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSSurroundingWordsModel.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
Mon Aug 24 21:28:41 2015
@@ -50,7 +50,6 @@ public class MFS extends WSDisambiguator
this.parameters = new MFSParameters();
}
-
/*
* @return the most frequent senses from wordnet
*/
@@ -151,7 +150,7 @@ public class MFS extends WSDisambiguator
public String[] disambiguate(WSDSample sample) {
if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
- return getMostFrequentSenses(sample);
+ return disambiguate(sample.getTargetWordTag());
} else {
if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
@@ -185,7 +184,7 @@ public class MFS extends WSDisambiguator
pos = POS.ADVERB;
} else if (tag.equalsIgnoreCase("n")) {
pos = POS.NOUN;
- } else if (tag.equalsIgnoreCase("a")) {
+ } else if (tag.equalsIgnoreCase("v")) {
pos = POS.VERB;
} else
pos = null;
@@ -209,7 +208,8 @@ public class MFS extends WSDisambiguator
} catch (JWNLException e) {
e.printStackTrace();
}
- senses[i] = senseKey;
+ senses[i] = WSDParameters.SenseSource.WORDNET.name() + " "
+ + senseKey;
break;
}
}
@@ -217,6 +217,7 @@ public class MFS extends WSDisambiguator
}
return senses;
} else {
+ WSDHelper.print(word + " " + pos);
System.out.println("The word has no definitions in WordNet !");
return null;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
Mon Aug 24 21:28:41 2015
@@ -19,14 +19,11 @@
package opennlp.tools.disambiguator;
-import java.io.File;
import java.util.ArrayList;
-import java.util.List;
import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.ims.IMSME;
import opennlp.tools.disambiguator.ims.IMSParameters;
-import opennlp.tools.disambiguator.ims.WTDIMS;
import org.junit.Test;
@@ -36,16 +33,18 @@ public class IMSEvaluatorTest {
@Test
public static void main(String[] args) {
+
+
WSDHelper.print("Evaluation Started");
-
+
+ // TODO write unit test
String modelsDir = "src\\test\\resources\\models\\";
WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- IMS ims = new IMS();
- IMSParameters imsParams = new IMSParameters();
- ims.setParams(imsParams);
+ IMSParameters imsParams = new IMSParameters("");
+ IMSME ims = new IMSME(imsParams);
ArrayList<String> words = seReader.getSensevalWords();
Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java?rev=1697504&r1=1697503&r2=1697504&view=diff
==============================================================================
--- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java (original)
+++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java Mon Aug 24 21:28:41 2015
@@ -22,7 +22,8 @@ package opennlp.tools.disambiguator;
import java.util.ArrayList;
import java.util.List;
-import opennlp.tools.disambiguator.ims.IMS;
+import opennlp.tools.disambiguator.ims.IMSME;
+import opennlp.tools.disambiguator.ims.IMSParameters;
import opennlp.tools.util.Span;
/**
@@ -38,24 +39,30 @@ import opennlp.tools.util.Span;
public class IMSTester {
public static void main(String[] args) {
+
+ // TODO write unit test
String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir+"en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin");
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
- IMS ims = new IMS();
+ IMSParameters params = new IMSParameters("");
-
- /**
- * This is how to make the context for one-word-disambiguation using IMS
- */
+ WSDHelper.print(params.getTrainingDataDirectory());
+
+ IMSME ims = new IMSME(params);
+
+
+ // This is how to make the context for one-word-disambiguation using IMS
+
String test1 = "We need to discuss important topic, please write to me soon.";
String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
String[] tags1 = WSDHelper.getTagger().tag(sentence1);
List<String> tempLemmas1 = new ArrayList<String>();
for (int i = 0; i < sentence1.length; i++) {
- String lemma = WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]);
+ String lemma = WSDHelper.getLemmatizer()
+ .lemmatize(sentence1[i], tags1[i]);
tempLemmas1.add(lemma);
}
String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
@@ -66,17 +73,16 @@ public class IMSTester {
WSDHelper.print(senses1);
WSDHelper.print("*****************************");
+ // This is how to make the context for disambiguation of span of words
- /**
- * This is how to make the context for disambiguation of span of words
- */
String test2 = "The component was highly radioactive to the point that"
+ " it has been activated the second it touched water";
String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
String[] tags2 = WSDHelper.getTagger().tag(sentence2);
List<String> tempLemmas2 = new ArrayList<String>();
for (int i = 0; i < sentence2.length; i++) {
- String lemma = WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]);
+ String lemma = WSDHelper.getLemmatizer()
+ .lemmatize(sentence2[i], tags2[i]);
tempLemmas2.add(lemma);
}
String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
@@ -85,7 +91,7 @@ public class IMSTester {
// output
List<String[]> senses2 = ims.disambiguate(sentence2, tags2, lemmas2, span);
for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
- String[] senses = senses2.get(i-span.getStart());
+ String[] senses = senses2.get(i - span.getStart());
System.out.print(lemmas2[i] + " :\t");
WSDHelper.print(senses);
WSDHelper.print("----------");
@@ -93,16 +99,15 @@ public class IMSTester {
WSDHelper.print("*****************************");
+ // This is how to make the context for all-words-disambiguation
- /**
- * This is how to make the context for all-words-disambiguation
- */
String test3 = "The summer almost over and I not to the beach even once";
String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
String[] tags3 = WSDHelper.getTagger().tag(sentence3);
List<String> tempLemmas3 = new ArrayList<String>();
for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]);
+ String lemma = WSDHelper.getLemmatizer()
+ .lemmatize(sentence3[i], tags3[i]);
tempLemmas3.add(lemma);
}
String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);