Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java Thu Aug 20 22:01:59 2015 @@ -1,191 +1 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package opennlp.tools.disambiguator; - -import java.util.ArrayList; -import java.util.Arrays; - -import net.sf.extjwnl.data.POS; - -public class WordToDisambiguate { - - // TODO Check if it is necessary to add an attribute [word] since the word in - // the sentence is not necessarily in the base form ?? 
- - protected String[] sentence; - protected String[] posTags; - - protected int wordIndex; - - protected int sense; - - protected ArrayList<String> senseIDs; - - public WordToDisambiguate(String[] sentence, int wordIndex) - throws IllegalArgumentException { - super(); - - if (wordIndex > sentence.length) { - throw new IllegalArgumentException("The index is out of bounds !"); - } - - this.sentence = sentence; - this.posTags = WSDHelper.getTagger().tag(sentence); - - this.wordIndex = wordIndex; - - this.sense = -1; - } - - public WordToDisambiguate(String[] sentence, int wordIndex, int sense) - throws IllegalArgumentException { - super(); - - if (wordIndex > sentence.length) { - throw new IllegalArgumentException("The index is out of bounds !"); - } - - this.sentence = sentence; - this.posTags = WSDHelper.getTagger().tag(sentence); - - this.wordIndex = wordIndex; - - this.sense = sense; - } - - public WordToDisambiguate(String[] sentence, int wordIndex, - ArrayList<String> senseIDs) throws IllegalArgumentException { - super(); - - if (wordIndex > sentence.length) { - throw new IllegalArgumentException("The index is out of bounds !"); - } - - this.sentence = sentence; - this.posTags = WSDHelper.getTagger().tag(sentence); - - this.wordIndex = wordIndex; - - this.senseIDs = senseIDs; - } - - public WordToDisambiguate(String[] sentence, String[] tokenTags, int wordIndex) { - this(sentence, wordIndex, -1); - } - - public WordToDisambiguate() { - String[] emptyString = {}; - int emptyInteger = 0; - - this.sentence = emptyString; - this.wordIndex = emptyInteger; - this.sense = -1; - - } - - // Sentence - public String[] getSentence() { - return sentence; - } - - public void setSentence(String[] sentence) { - this.sentence = sentence; - } - - // Sentence Pos-Tags - public String[] getPosTags() { - return posTags; - } - - public void setPosTags(String[] posTags) { - this.posTags = posTags; - } - - // Word to disambiguate - public int getWordIndex() { - return wordIndex; - } 
- - public String getRawWord() { - - String wordBaseForm = WSDHelper.getLemmatizer().lemmatize( - this.sentence[wordIndex], this.posTags[wordIndex]); - - String ref = ""; - - if ((WSDHelper.getPOS(this.posTags[wordIndex]) != null)) { - if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) { - ref = wordBaseForm + ".v"; - } else if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) { - ref = wordBaseForm + ".n"; - } else if (WSDHelper.getPOS(this.posTags[wordIndex]) - .equals(POS.ADJECTIVE)) { - ref = wordBaseForm + ".a"; - } else if (WSDHelper.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) { - ref = wordBaseForm + ".r"; - } - - } - - return ref; - } - - public String getWord() { - return this.sentence[this.wordIndex]; - } - - public String getPosTag() { - return this.posTags[this.wordIndex]; - } - - public void setWordIndex(int wordIndex) { - this.wordIndex = wordIndex; - } - - // Word to disambiguate sense - public int getSense() { - return sense; - } - - public void setSense(int sense) { - this.sense = sense; - } - - // Sense as in the source - // TODO fix the conflict between this ID of the sense and that in the - // attribute [sense] - public ArrayList<String> getSenseIDs() { - return senseIDs; - } - - public void setSenseIDs(ArrayList<String> senseIDs) { - this.senseIDs = senseIDs; - } - - public String toString() { - return (wordIndex + "\t" + getWord() + "\n" + sentence); - } - - public void print() { - WSDHelper.print("Sentence: " + Arrays.asList(sentence) + "\n" + "Index: " - + wordIndex + "\n" + "Word: " + getWord() + "\n" + "Sense ID: " - + senseIDs.get(0)); - } -} +// TODO to be removed \ No newline at end of file
Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java Thu Aug 20 22:01:59 2015 @@ -20,6 +20,7 @@ package opennlp.tools.disambiguator.contextclustering; import java.security.InvalidParameterException; +import java.util.List; import opennlp.tools.disambiguator.WSDParameters; import opennlp.tools.disambiguator.WSDSample; @@ -32,7 +33,7 @@ import opennlp.tools.util.Span; * * This implementation is based on {@link http://nlp.cs.rpi.edu/paper/wsd.pdf} */ -public class ContextClusterer implements WSDisambiguator { +public class ContextClusterer extends WSDisambiguator { protected ContextClustererParameters params; @@ -56,14 +57,7 @@ public class ContextClusterer implements @Override public String[] disambiguate(String[] tokenizedContext, String[] tokenTags, - int ambiguousTokenIndex, String ambiguousTokenLemma) { - // TODO Auto-generated method stub - return null; - } - - @Override - public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags, - Span ambiguousTokenIndexSpan, String ambiguousTokenLemma) { + String[] lemmas, int ambiguousTokenIndex) { // TODO Auto-generated method stub return null; } @@ -74,10 +68,6 @@ public class ContextClusterer implements return null; } - @Override - public String[] disambiguate(String[] inputText, int inputWordIndex) { - // TODO Auto-generated method stub - return null; - } + } Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java Thu Aug 20 22:01:59 2015 @@ -25,7 +25,8 @@ import java.util.ArrayList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; -import opennlp.tools.disambiguator.WordToDisambiguate; +import opennlp.tools.disambiguator.WSDHelper; +import opennlp.tools.disambiguator.WSDSample; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -175,10 +176,10 @@ public class SemcorReaderExtended { return result; } - public ArrayList<WordToDisambiguate> getSemcorOneFileData(String file, + public ArrayList<WSDSample> getSemcorOneFileData(String file, String wordTag) { - ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>(); + ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>(); try { @@ -223,8 +224,15 @@ public class SemcorReaderExtended { } if (!senses.isEmpty()) { - WordToDisambiguate wtd = new WordToDisambiguate( - sentence.split("\\s"), index, senses); + String[] words = sentence.split("\\s"); + String[] tags = WSDHelper.getTagger().tag(words); + String[] lemmas = new String[words.length]; + + for (int i = 0; i < words.length; i++) { + lemmas[i] = WSDHelper.getLemmatizer().lemmatize(words[i], tags[i]); + } + + WSDSample wtd = new WSDSample(words, tags, lemmas, index, senses); setInstances.add(wtd); } @@ -253,10 +261,9 @@ public class SemcorReaderExtended { * The word, of which we are looking for the 
instances * @return the list of the {@link WordToDisambiguate} instances */ - public ArrayList<WordToDisambiguate> getSemcorFolderData(String folder, - String wordTag) { + public ArrayList<WSDSample> getSemcorFolderData(String folder, String wordTag) { - ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>(); + ArrayList<WSDSample> result = new ArrayList<WSDSample>(); String directory = path + folder + tagfiles; File tempFolder = new File(directory); @@ -266,7 +273,7 @@ public class SemcorReaderExtended { listOfFiles = tempFolder.listFiles(); for (File file : listOfFiles) { - ArrayList<WordToDisambiguate> list = getSemcorOneFileData(directory + ArrayList<WSDSample> list = getSemcorOneFileData(directory + file.getName(), wordTag); result.addAll(list); } @@ -285,12 +292,12 @@ public class SemcorReaderExtended { * @return the list of the {@link WordToDisambiguate} instances of the word to * disambiguate */ - public ArrayList<WordToDisambiguate> getSemcorData(String wordTag) { + public ArrayList<WSDSample> getSemcorData(String wordTag) { - ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>(); + ArrayList<WSDSample> result = new ArrayList<WSDSample>(); for (String folder : folders) { - ArrayList<WordToDisambiguate> list = getSemcorFolderData(folder, wordTag); + ArrayList<WSDSample> list = getSemcorFolderData(folder, wordTag); result.addAll(list); } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java 
Thu Aug 20 22:01:59 2015 @@ -36,8 +36,8 @@ import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import opennlp.tools.disambiguator.WordToDisambiguate; -import opennlp.tools.disambiguator.ims.WTDIMS; +import opennlp.tools.disambiguator.WSDHelper; +import opennlp.tools.disambiguator.WSDSample; /** * This class handles the extraction of Senseval-3 data from the different files @@ -52,19 +52,6 @@ public class SensevalReader { protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap"; protected String wordList = sensevalDirectory + "EnglishLS.train.key"; - // protected String dict = sensevalDirectory + "EnglishLS.dictionary.xml"; - // protected String map = sensevalDirectory + "EnglishLS.sensemap"; - - /** - * The XML file of Senseval presents some issues that need to be fixed first - */ - private String fixXmlFile() { - - // TODO fix this ! - - return null; - } - public SensevalReader() { super(); } @@ -157,9 +144,9 @@ public class SensevalReader { * @return the list of the {@link WordToDisambiguate} instances of the word to * disambiguate */ - public ArrayList<WordToDisambiguate> getSensevalData(String wordTag) { + public ArrayList<WSDSample> getSensevalData(String wordTag) { - ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>(); + ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>(); try { @@ -188,28 +175,7 @@ public class SensevalReader { Node nInstance = nInstances.item(j); if (nInstance.getNodeType() == Node.ELEMENT_NODE) { - - Element eInstance = (Element) nInstance; - - String[] wordPos = eLexelt.getAttribute("item").split("\\."); - String word = wordPos[0]; // Word - String tag; // Part of Speech - - if (wordPos[1].equals("n")) { - tag = "noun"; - } else if (wordPos[1].equals("v")) { - tag = "verb"; - } else if (wordPos[1].equals("a")) { - tag = "adjective"; - } else { - tag = "adverb"; - } - - String id = eInstance.getAttribute("id"); - String source = 
eInstance.getAttribute("docsrc"); - - ArrayList<String> answers = new ArrayList<String>(); - String sentence = ""; + ArrayList<String> senseIDs = new ArrayList<String>(); String rawWord = ""; String[] finalText = null; int index = 0; @@ -227,11 +193,10 @@ public class SensevalReader { String temp = senseid; // String[] temp = { answer, senseid }; - answers.add(temp); + senseIDs.add(temp); } if (nChild.getNodeName().equals("context")) { - sentence = ((Element) nChild).getTextContent(); if (nChild.hasChildNodes()) { String textBefore = nChild.getChildNodes().item(0) @@ -272,9 +237,19 @@ public class SensevalReader { } - WTDIMS wordToDisambiguate = new WTDIMS(finalText, index, - answers); - setInstances.add(wordToDisambiguate); + String[] words = finalText; + String[] tags = WSDHelper.getTagger().tag(words); + String[] lemmas = new String[words.length]; + + for (int k = 0; k < words.length; k++) { + lemmas[k] = WSDHelper.getLemmatizer().lemmatize(words[k], + tags[k]); + } + + WSDSample wtd = new WSDSample(words, tags, lemmas, index, + senseIDs); + setInstances.add(wtd); + } } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java Thu Aug 20 22:01:59 2015 @@ -46,13 +46,12 @@ import java.util.zip.GZIPInputStream; import opennlp.tools.ml.model.MaxentModel; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.ObjectStreamUtils; -import opennlp.tools.util.Span; import opennlp.tools.util.TrainingParameters; import opennlp.tools.disambiguator.FeaturesExtractor; +import opennlp.tools.disambiguator.WSDHelper; 
import opennlp.tools.disambiguator.WSDParameters; import opennlp.tools.disambiguator.WSDSample; import opennlp.tools.disambiguator.WSDisambiguator; -import opennlp.tools.disambiguator.WordToDisambiguate; import opennlp.tools.disambiguator.datareader.SemcorReaderExtended; import opennlp.tools.disambiguator.datareader.SensevalReader; import opennlp.tools.disambiguator.mfs.MFS; @@ -70,7 +69,7 @@ import opennlp.tools.disambiguator.mfs.M * check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details * about this approach */ -public class IMS implements WSDisambiguator { +public class IMS extends WSDisambiguator { public IMSParameters parameters; @@ -244,8 +243,6 @@ public class IMS implements WSDisambigua e.printStackTrace(); } - System.out.println("Done"); - } private void extractFeature(WTDIMS word) { @@ -344,15 +341,15 @@ public class IMS implements WSDisambigua } /** - * The disambiguation method for a single word + * The disambiguation method for a single word, it requires as input one + * object of type WTDIMS * * @param inputText * : the text containing the word to disambiguate * @param inputWordIndex * : the index of the word to disambiguate */ - @Override - public String[] disambiguate(String[] inputText, int inputWordIndex) { + public String[] disambiguate(WTDIMS wordToDisambiguate) { String trainingDataDirectory = IMSParameters.trainingDataDirectory; @@ -362,11 +359,10 @@ public class IMS implements WSDisambigua file.mkdirs(); } - WTDIMS word = new WTDIMS(inputText, inputWordIndex); - fExtractor.extractIMSFeatures(word, this.parameters.getWindowSize(), - this.parameters.getNgram()); + fExtractor.extractIMSFeatures(wordToDisambiguate, + this.parameters.getWindowSize(), this.parameters.getNgram()); - String wordTag = word.getWordTag(); + String wordTag = wordToDisambiguate.getWordTag(); String wordTrainingbinFile = trainingDataDirectory + wordTag + ".gz"; @@ -378,10 +374,10 @@ public class IMS implements WSDisambigua if (bf.exists() && 
!bf.isDirectory()) { // If the trained model exists ArrayList<String> surrWords = getAllSurroundingWords(wordTag); - fExtractor.serializeIMSFeatures(word, surrWords); + fExtractor.serializeIMSFeatures(wordToDisambiguate, surrWords); loadedMaxentModel = load(wordTrainingbinFile); - String[] context = cg.getContext(word); + String[] context = cg.getContext(wordToDisambiguate); double[] outcomeProbs = loadedMaxentModel.eval(context); outcome = loadedMaxentModel.getBestOutcome(outcomeProbs); @@ -389,10 +385,10 @@ public class IMS implements WSDisambigua } else { // Depending on the source, go fetch the training data ArrayList<WTDIMS> trainingInstances = new ArrayList<WTDIMS>(); - switch (this.parameters.getSource().code) { - case 1: { + switch (this.parameters.getTrainingSource()) { + case SEMCOR: { SemcorReaderExtended sReader = new SemcorReaderExtended(); - for (WordToDisambiguate ti : sReader.getSemcorData(wordTag)) { + for (WSDSample ti : sReader.getSemcorData(wordTag)) { WTDIMS imsIT = new WTDIMS(ti); extractFeature(imsIT); trainingInstances.add(imsIT); @@ -400,17 +396,17 @@ public class IMS implements WSDisambigua break; } - case 2: { + case SEMEVAL: { SensevalReader sReader = new SensevalReader(); - for (WordToDisambiguate ti : sReader.getSensevalData(wordTag)) { - WTDIMS imsIT = (WTDIMS) ti; + for (WSDSample ti : sReader.getSensevalData(wordTag)) { + WTDIMS imsIT = new WTDIMS(ti); extractFeature(imsIT); trainingInstances.add(imsIT); } break; } - case 3: { + case OTHER: { // TODO check the case when the user selects his own data set (make an // interface to collect training data) break; @@ -423,11 +419,11 @@ public class IMS implements WSDisambigua ArrayList<String> surrWords = getAllSurroundingWords(wordTag); - fExtractor.serializeIMSFeatures(word, surrWords); + fExtractor.serializeIMSFeatures(wordToDisambiguate, surrWords); bf = new File(wordTrainingbinFile); loadedMaxentModel = load(wordTrainingbinFile); - String[] context = cg.getContext(word); + String[] 
context = cg.getContext(wordToDisambiguate); double[] outcomeProbs = loadedMaxentModel.eval(context); outcome = loadedMaxentModel.getBestOutcome(outcomeProbs); @@ -437,11 +433,8 @@ public class IMS implements WSDisambigua if (!outcome.equals("")) { - // System.out.println("The sense is [" + outcome + "] : " /*+ - // Loader.getDictionary().getWordBySenseKey(outcome.split("%")[1]).getSynset().getGloss()*/); - - outcome = parameters.source.name() + " " + wordTag.split("\\.")[0] + "%" - + outcome; + outcome = parameters.getSenseSource().name() + " " + + wordTag.split("\\.")[0] + "%" + outcome; String[] s = { outcome }; @@ -449,29 +442,63 @@ public class IMS implements WSDisambigua } else { // if no training data exist - return MFS.getMostFrequentSense(word); + MFS mfs = new MFS(); + return mfs.disambiguate(wordTag); } } @Override - public String[] disambiguate(String[] tokenizedContext, String[] tokenTags, - int ambiguousTokenIndex, String ambiguousTokenLemma) { - // TODO Update - return null; - } + public String[] disambiguate(WSDSample sample) { + if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { + WTDIMS wordToDisambiguate = new WTDIMS(sample); + return disambiguate(wordToDisambiguate); + + } else { + if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) { + String s = IMSParameters.SenseSource.WSDHELPER.name() + " " + + sample.getTargetTag(); + String[] sense = { s }; + return sense; + } else { + return null; + } + } - @Override - public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags, - Span ambiguousTokenIndexSpan, String ambiguousTokenLemma) { - // TODO Update - return null; } - @Override - public String[] disambiguate(WSDSample sample) { - // TODO Update - return null; + /** + * The IMS disambiguation method for a single word + * + * @param tokenizedContext + * : the text containing the word to disambiguate + * @param tokenTags + * : the tags corresponding to the context + * @param lemmas + * : the lemmas of ALL the 
words in the context + * @param index + * : the index of the word to disambiguate + * @return an array of the senses of the word to disambiguate + */ + public String[] disambiguate(String[] tokenizedContext, String[] tokenTags, + String[] lemmas, int index) { + + if (WSDHelper.isRelevantPOSTag(tokenTags[index])) { + WTDIMS wordToDisambiguate = new WTDIMS(tokenizedContext, tokenTags, + lemmas, index); + return disambiguate(wordToDisambiguate); + + } else { + if (WSDHelper.getNonRelevWordsDef(tokenTags[index]) != null) { + String s = IMSParameters.SenseSource.WSDHELPER.name() + " " + + tokenTags[index]; + String[] sense = { s }; + return sense; + } else { + return null; + } + } + } } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java Thu Aug 20 22:01:59 2015 @@ -0,0 +1 @@ +// TODO To be removed \ No newline at end of file Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java Thu Aug 20 22:01:59 2015 @@ -29,22 +29,9 @@ import opennlp.tools.disambiguator.WSDPa */ public class IMSParameters 
extends WSDParameters { - public static enum Source { - SEMCOR(1, "semcor"), SEMEVAL(2, "semeval"), OTHER(3, "other"); - - public int code; - public String src; - - private Source(int code, String src) { - this.code = code; - this.src = src; - } - } - protected String languageCode; protected int windowSize; protected int ngram; - protected Source source; public static final String resourcesFolder = "src\\test\\resources\\"; public static final String trainingDataDirectory = resourcesFolder @@ -63,12 +50,13 @@ public class IMSParameters extends WSDPa * @param source * the source of the training data */ - public IMSParameters(int windowSize, int ngram, Source source) { - super(); + public IMSParameters(int windowSize, int ngram, + TrainingSource trainingSource, SenseSource senseSource) { this.languageCode = "En"; this.windowSize = windowSize; this.ngram = ngram; - this.source = source; + this.trainingSource = trainingSource; + this.senseSource = senseSource; this.isCoarseSense = false; File folder = new File(trainingDataDirectory); @@ -77,15 +65,15 @@ public class IMSParameters extends WSDPa } public IMSParameters() { - this(3, 2, Source.SEMCOR); + this(3, 2, TrainingSource.SEMCOR, SenseSource.WORDNET); } - public IMSParameters(Source source) { - this(3, 2, source); + public IMSParameters(TrainingSource source) { + this(3, 2, source, SenseSource.WORDNET); } public IMSParameters(int windowSize, int ngram) { - this(windowSize, ngram, Source.SEMCOR); + this(windowSize, ngram, TrainingSource.SEMCOR, SenseSource.WORDNET); } public String getLanguageCode() { @@ -112,14 +100,6 @@ public class IMSParameters extends WSDPa this.ngram = ngram; } - public Source getSource() { - return source; - } - - public void setSource(Source source) { - this.source = source; - } - void init() { } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java Thu Aug 20 22:01:59 2015 @@ -20,39 +20,56 @@ package opennlp.tools.disambiguator.ims; import java.util.ArrayList; +import java.util.List; import net.sf.extjwnl.data.POS; import opennlp.tools.disambiguator.WSDHelper; -import opennlp.tools.disambiguator.WordToDisambiguate; +import opennlp.tools.disambiguator.WSDSample; -public class WTDIMS extends WordToDisambiguate { +public class WTDIMS { + // Attributes related to the context + protected String[] sentence; + protected String[] posTags; + protected String[] lemmas; + protected int wordIndex; + protected int sense; + protected List<String> senseIDs; + + // Attributes related to IMS features protected String[] posOfSurroundingWords; protected String[] surroundingWords; protected String[] localCollocations; - protected String[] features; - public WTDIMS(String[] sentence, int word, int sense) { - super(sentence, word, sense); - + public WTDIMS(String[] sentence, String[] posTags, String[] lemmas, + int wordIndex) { + this.sentence = sentence; + this.posTags = posTags; + this.wordIndex = wordIndex; + this.lemmas = lemmas; } - public WTDIMS(String[] sentence, int word) { - super(sentence, word); + public WTDIMS(String[] sentence, String[] posTags, String[] lemmas, + int wordIndex, List<String> senseIDs) { + this.sentence = sentence; + this.posTags = posTags; + this.wordIndex = wordIndex; + this.lemmas = lemmas; + this.senseIDs = senseIDs; + } - public WTDIMS(String xmlWord, ArrayList<String> senseIDs, String xmlSentence, - String xmlrawWord) { + public WTDIMS(String[] sentence, String[] posTags, 
String[] lemmas, + String word, List<String> senseIDs) { super(); - // this.word = xmlWord; - - this.sentence = WSDHelper.getTokenizer().tokenize(xmlSentence); - this.posTags = WSDHelper.getTagger().tag(this.sentence); + this.sentence = sentence; + this.posTags = posTags; + this.lemmas = lemmas; for (int i = 0; i < sentence.length; i++) { - if (xmlrawWord.equals(sentence[i])) { + if (word.equals(sentence[i])) { this.wordIndex = i; break; } @@ -62,16 +79,93 @@ public class WTDIMS extends WordToDisamb } - public WTDIMS(WordToDisambiguate wtd) { - super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense()); - this.senseIDs = wtd.getSenseIDs(); + public WTDIMS(WSDSample sample) { + this.sentence = sample.getSentence(); + this.posTags = sample.getTags(); + this.lemmas = sample.getLemmas(); + this.wordIndex = sample.getTargetPosition(); + this.senseIDs = sample.getSenseIDs(); + + } + + public String[] getSentence() { + return sentence; + } + + public void setSentence(String[] sentence) { + this.sentence = sentence; + } + + public String[] getPosTags() { + return posTags; + } + + public void setPosTags(String[] posTags) { + this.posTags = posTags; + } + + public int getWordIndex() { + return wordIndex; + } + + public void setWordIndex(int wordIndex) { + this.wordIndex = wordIndex; + } + + public String[] getLemmas() { + return lemmas; + } + + public void setLemmas(String[] lemmas) { + this.lemmas = lemmas; } - public WTDIMS(String[] sentence, int wordIndex, ArrayList<String> senseIDs) { - super(sentence, wordIndex); + public int getSense() { + return sense; + } + + public void setSense(int sense) { + this.sense = sense; + } + + public List<String> getSenseIDs() { + return senseIDs; + } + + public void setSenseIDs(ArrayList<String> senseIDs) { this.senseIDs = senseIDs; } + public String getWord() { + return this.getSentence()[this.getWordIndex()]; + } + + public String getWordTag() { + + String wordBaseForm = this.getLemmas()[this.getWordIndex()]; + + String ref = ""; + + 
if ((WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) != null)) { + if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]).equals( + POS.VERB)) { + ref = wordBaseForm + ".v"; + } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) + .equals(POS.NOUN)) { + ref = wordBaseForm + ".n"; + } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) + .equals(POS.ADJECTIVE)) { + ref = wordBaseForm + ".a"; + } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) + .equals(POS.ADVERB)) { + ref = wordBaseForm + ".r"; + } + } + + return ref; + } + + public String[] getPosOfSurroundingWords() { return posOfSurroundingWords; } @@ -104,25 +198,4 @@ public class WTDIMS extends WordToDisamb this.features = features; } - public String getWordTag() { - - String wordBaseForm = WSDHelper.getLemmatizer().lemmatize(this.getWord(), - this.getPosTag()); - - String ref = ""; - - if ((WSDHelper.getPOS(this.getPosTag()) != null)) { - if (WSDHelper.getPOS(this.getPosTag()).equals(POS.VERB)) { - ref = wordBaseForm + ".v"; - } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.NOUN)) { - ref = wordBaseForm + ".n"; - } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.ADJECTIVE)) { - ref = wordBaseForm + ".a"; - } else if (WSDHelper.getPOS(this.getPosTag()).equals(POS.ADVERB)) { - ref = wordBaseForm + ".r"; - } - } - - return ref; - } } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java Thu Aug 20 22:01:59 2015 @@ -31,7 +31,6 @@ import opennlp.tools.disambiguator.WSDis 
import opennlp.tools.disambiguator.WordPOS; import opennlp.tools.disambiguator.WordSense; import opennlp.tools.disambiguator.mfs.MFS; -import opennlp.tools.util.Span; import net.sf.extjwnl.JWNLException; import net.sf.extjwnl.data.Synset; import net.sf.extjwnl.data.Word; @@ -44,7 +43,7 @@ import net.sf.extjwnl.data.Word; * the approach are included in this class. * */ -public class Lesk implements WSDisambiguator { +public class Lesk extends WSDisambiguator { /** * The lesk specific parameters @@ -113,8 +112,12 @@ public class Lesk implements WSDisambigu ArrayList<SynNode> nodes = new ArrayList<SynNode>(); for (int i = 0; i < sample.getSentence().length; i++) { - contextWords - .add(new WordPOS(sample.getSentence()[i], sample.getTags()[i])); + if (!WSDHelper.getStopCache().containsKey(sample.getSentence()[i])) { + if (WSDHelper.getRelvCache().containsKey(sample.getTags()[i])) { + contextWords.add(new WordPOS(sample.getSentence()[i], sample + .getTags()[i])); + } + } } for (Synset synset : synsets) { SynNode node = new SynNode(synset, contextWords); @@ -158,8 +161,12 @@ public class Lesk implements WSDisambigu for (int i = index - getParams().win_b_size; i <= index + getParams().win_f_size; i++) { if (i >= 0 && i < sample.getSentence().length && i != index) { - contextWords.add(new WordPOS(sample.getSentence()[i], - sample.getTags()[i])); + if (!WSDHelper.getStopCache().containsKey(sample.getSentence()[i])) { + if (WSDHelper.getRelvCache().containsKey(sample.getTags()[i])) { + contextWords.add(new WordPOS(sample.getSentence()[i], sample + .getTags()[i])); + } + } } } @@ -944,44 +951,18 @@ public class Lesk implements WSDisambigu return count; } - /** - * Disambiguates an ambiguous word in its context - * - * @param tokenizedContext - * @param ambiguousTokenIndex - * @return array of sense indexes from WordNet ordered by their score. The - * result format is <b>Source</b> <b>SenseID</b> If the input token is - * non relevant a null is returned. 
- */ - @Override - public String[] disambiguate(String[] tokenizedContext, String[] tokenTags, - int ambiguousTokenIndex, String ambiguousTokenLemma) { - return disambiguate(new WSDSample(tokenizedContext, tokenTags, - ambiguousTokenIndex, ambiguousTokenLemma)); - } - - /** - * Disambiguates an ambiguous word in its context The user can set a span of - * inputWords from the tokenized input - * - * @param inputText - * @param inputWordSpans - * @return array of array of sense indexes from WordNet ordered by their - * score. The result format is <b>Source</b> <b>SenseID</b> If the - * input token is non relevant a null is returned. - */ - @Override - public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags, - Span ambiguousTokenSpan, String ambiguousTokenLemma) { - // TODO need to work on spans - return null; - } - @Override public String[] disambiguate(WSDSample sample) { - // if the word is not relevant return null - if (!WSDHelper.isRelevant(sample.getTargetTag())) { - return null; + // if not relevant POS tag + if (!WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { + if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) { + String s = WSDParameters.SenseSource.WSDHELPER.name() + " " + + sample.getTargetTag(); + String[] sense = { s }; + return sense; + } else { + return null; + } } ArrayList<WordSense> wsenses = null; @@ -1020,7 +1001,8 @@ public class Lesk implements WSDisambigu for (int i = 0; i < wsenses.size(); i++) { synsetWords = wsenses.get(i).getNode().synset.getWords(); for (Word synWord : synsetWords) { - if (synWord.getLemma().equals(sample.getTargetLemma())) { + if (synWord.getLemma().equals( + sample.getLemmas()[sample.getTargetPosition()])) { try { senseKey = synWord.getSenseKey(); } catch (JWNLException e) { @@ -1041,9 +1023,10 @@ public class Lesk implements WSDisambigu } @Override - public String[] disambiguate(String[] inputText, int inputWordIndex) { - // TODO Deprecate - return null; + public String[] 
disambiguate(String[] tokenizedContext, String[] tokenTags, + String[] lemmas, int ambiguousTokenIndex) { + return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas, + ambiguousTokenIndex)); } } Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java Thu Aug 20 22:01:59 2015 @@ -37,7 +37,7 @@ public class LeskParameters extends WSDP // DEFAULTS protected static final LESK_TYPE DFLT_LESK_TYPE = LESK_TYPE.LESK_EXT_EXP_CTXT; - protected static final Source DFLT_SOURCE = Source.WORDNET; + protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET; protected static final int DFLT_WIN_SIZE = 10; protected static final int DFLT_DEPTH = 1; protected static final double DFLT_DEPTH_WEIGHT = 0.8; @@ -46,7 +46,7 @@ public class LeskParameters extends WSDP protected LESK_TYPE leskType; - protected Source source; + protected SenseSource source; protected int win_f_size; protected int win_b_size; protected int depth; Modified: opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java Thu Aug 20 22:01:59 2015 @@ 
-32,15 +32,13 @@ import opennlp.tools.disambiguator.WSDPa import opennlp.tools.disambiguator.WSDSample; import opennlp.tools.disambiguator.WSDisambiguator; import opennlp.tools.disambiguator.WordPOS; -import opennlp.tools.disambiguator.WordToDisambiguate; -import opennlp.tools.util.Span; /** * Implementation of the <b>Most Frequent Sense</b> baseline approach. This * approach returns the senses in order of frequency in WordNet. The first sense * is the most frequent. */ -public class MFS implements WSDisambiguator { +public class MFS extends WSDisambiguator { public MFSParameters parameters; @@ -52,47 +50,7 @@ public class MFS implements WSDisambigua this.parameters = new MFSParameters(); } - @Deprecated - public static String[] getMostFrequentSense( - WordToDisambiguate wordToDisambiguate) { - - String word = wordToDisambiguate.getRawWord().toLowerCase(); - POS pos = WSDHelper.getPOS(wordToDisambiguate.getPosTag()); - - if (pos != null) { - - WordPOS wordPOS = new WordPOS(word, pos); - - ArrayList<Synset> synsets = wordPOS.getSynsets(); - - int size = synsets.size(); - - String[] senses = new String[size]; - - for (int i = 0; i < size; i++) { - String senseKey = null; - for (Word wd : synsets.get(i).getWords()) { - if (wd.getLemma().equals( - wordToDisambiguate.getRawWord().split("\\.")[0])) { - try { - senseKey = wd.getSenseKey(); - } catch (JWNLException e) { - e.printStackTrace(); - } - senses[i] = "WordNet " + senseKey; - break; - } - } - - } - return senses; - } else { - System.out.println("The word has no definitions in WordNet !"); - return null; - } - - } - + /* * @return the most frequent senses from wordnet */ @@ -102,19 +60,23 @@ public class MFS implements WSDisambigua for (Word wd : synsets.get(0).getWords()) { if (WSDParameters.isStemCompare) { WordPOS wdPOS = new WordPOS(wd.getLemma(), wd.getPOS()); - WordPOS samplePOS = new WordPOS(sample.getTargetLemma(), + WordPOS samplePOS = new WordPOS( + sample.getLemmas()[sample.getTargetPosition()], 
WSDHelper.getPOS(sample.getTargetTag())); if (wdPOS.isStemEquivalent(samplePOS)) { try { - return WSDParameters.Source.WORDNET.name() + " " + wd.getSenseKey(); + return WSDParameters.SenseSource.WORDNET.name() + " " + + wd.getSenseKey(); } catch (JWNLException e) { e.printStackTrace(); } } } else { - if (wd.getLemma().equalsIgnoreCase((sample.getTargetLemma()))) { + if (wd.getLemma().equalsIgnoreCase( + (sample.getLemmas()[sample.getTargetPosition()]))) { try { - return WSDParameters.Source.WORDNET.name() + " " + wd.getSenseKey(); + return WSDParameters.SenseSource.WORDNET.name() + " " + + wd.getSenseKey(); } catch (JWNLException e) { e.printStackTrace(); } @@ -134,11 +96,12 @@ public class MFS implements WSDisambigua for (Word wd : synsets.get(i).getWords()) { if (WSDParameters.isStemCompare) { WordPOS wdPOS = new WordPOS(wd.getLemma(), wd.getPOS()); - WordPOS samplePOS = new WordPOS(sample.getTargetLemma(), + WordPOS samplePOS = new WordPOS( + sample.getLemmas()[sample.getTargetPosition()], WSDHelper.getPOS(sample.getTargetTag())); if (wdPOS.isStemEquivalent(samplePOS)) { try { - senseKeys[i] = WSDParameters.Source.WORDNET.name() + " " + senseKeys[i] = WSDParameters.SenseSource.WORDNET.name() + " " + wd.getSenseKey(); break; } catch (JWNLException e) { @@ -147,9 +110,10 @@ public class MFS implements WSDisambigua break; } } else { - if (wd.getLemma().equalsIgnoreCase((sample.getTargetLemma()))) { + if (wd.getLemma().equalsIgnoreCase( + (sample.getLemmas()[sample.getTargetPosition()]))) { try { - senseKeys[i] = WSDParameters.Source.WORDNET.name() + " " + senseKeys[i] = WSDParameters.SenseSource.WORDNET.name() + " " + wd.getSenseKey(); break; } catch (JWNLException e) { @@ -185,27 +149,77 @@ public class MFS implements WSDisambigua @Override public String[] disambiguate(WSDSample sample) { - return getMostFrequentSenses(sample); + + if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) { + return getMostFrequentSenses(sample); + + } else { + if 
(WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) { + String s = WSDParameters.SenseSource.WSDHELPER.name() + " " + + sample.getTargetTag(); + String[] sense = { s }; + return sense; + } else { + return null; + } + } } @Override public String[] disambiguate(String[] tokenizedContext, String[] tokenTags, - int ambiguousTokenIndex, String lemma) { - return disambiguate(new WSDSample(tokenizedContext, tokenTags, - ambiguousTokenIndex, lemma)); - } + String[] lemmas, int ambiguousTokenIndex) { + return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas, + ambiguousTokenIndex)); + } + + public String[] disambiguate(String wordTag) { + + String word = wordTag.split("\\.")[0]; + String tag = wordTag.split("\\.")[1]; + + POS pos; + + if (tag.equalsIgnoreCase("a")) { + pos = POS.ADJECTIVE; + } else if (tag.equalsIgnoreCase("r")) { + pos = POS.ADVERB; + } else if (tag.equalsIgnoreCase("n")) { + pos = POS.NOUN; + } else if (tag.equalsIgnoreCase("v")) { + pos = POS.VERB; + } else + pos = null; - @Override - public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags, - Span ambiguousTokenIndexSpan, String ambiguousTokenLemma) { - // TODO A iterate over span - return null; - } + if (pos != null) { - @Override - public String[] disambiguate(String[] inputText, int inputWordIndex) { - // TODO Deprecate - return null; - } + WordPOS wordPOS = new WordPOS(word, pos); + + ArrayList<Synset> synsets = wordPOS.getSynsets(); + + int size = synsets.size(); + + String[] senses = new String[size]; + + for (int i = 0; i < size; i++) { + String senseKey = null; + for (Word wd : synsets.get(i).getWords()) { + if (wd.getLemma().equals(word)) { + try { + senseKey = wd.getSenseKey(); + } catch (JWNLException e) { + e.printStackTrace(); + } + senses[i] = senseKey; + break; + } + } + } + return senses; + } else { + System.out.println("The word has no definitions in WordNet !"); + return null; + } + + } } Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java (original) +++ opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java Thu Aug 20 22:01:59 2015 @@ -27,22 +27,22 @@ public class MFSParameters extends WSDPa public MFSParameters() { this.isCoarseSense = false; - this.source = Source.WORDNET; + this.source = SenseSource.WORDNET; } - protected Source source; + protected SenseSource source; - public Source getSource() { + public SenseSource getSource() { return source; } - public void setSource(Source source) { + public void setSource(SenseSource source) { this.source = source; } @Override public boolean isValid() { - return EnumUtils.isValidEnum(Source.class, source.name()); + return EnumUtils.isValidEnum(SenseSource.class, source.name()); } } Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java Thu Aug 20 22:01:59 2015 @@ -39,11 +39,10 @@ public class IMSEvaluatorTest { WSDHelper.print("Evaluation Started"); String modelsDir = "src\\test\\resources\\models\\"; - WSDHelper.loadTokenizer(modelsDir+"en-token.bin"); - 
WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict"); - WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin"); - - + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); + WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); + IMS ims = new IMS(); IMSParameters imsParams = new IMSParameters(); ims.setParams(imsParams); @@ -56,7 +55,7 @@ public class IMSEvaluatorTest { // don't take verbs because they are not from WordNet if (!word.split("\\.")[1].equals("v")) { - ArrayList<WSDSample> instances = getTestData(word); + ArrayList<WSDSample> instances = seReader.getSensevalData(word); if (instances != null) { WSDHelper.print("------------------" + word + "------------------"); for (WSDSample instance : instances) { @@ -74,59 +73,4 @@ public class IMSEvaluatorTest { } } - - /** - * For a specific word, return the Semeval3 corresponding instances in form of - * {@link WSDIMS} - * - * @param wordTag - * the word of which the instances are to be collected. wordTag has - * to be in the format "word.POS" (e.g., "activate.v", "smart.a", - * etc.) 
- * @return list of {@link WSDIMS} instances of the wordTag - */ - @Deprecated - protected static ArrayList<WTDIMS> getTestDataOld(String wordTag) { - - ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>(); - for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) { - WTDIMS wtdims = new WTDIMS(wtd); - instances.add(wtdims); - } - - return instances; - } - - protected static ArrayList<WSDSample> getTestData(String wordTag) { - - ArrayList<WSDSample> instances = new ArrayList<WSDSample>(); - for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) { - List<WordPOS> words = WSDHelper.getAllRelevantWords(wtd); - int targetWordIndex=0; - for (int i=0; i<words.size();i++){ - if(words.get(i).isTarget){ - targetWordIndex = i; - } - } - String[] tags = new String[words.size()]; - String[] tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - } - String targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - - WSDSample sample = new WSDSample(tokens,tags,targetWordIndex,targetLemma); - sample.setSenseIDs(wtd.getSenseIDs()); - if (sample != null) { - if (sample.getSenseIDs().get(0) != null - && !sample.getSenseIDs().get(0).equalsIgnoreCase("U")) { - instances.add(sample); - } - } - } - return instances; - } - } Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java Thu Aug 20 22:01:59 2015 @@ -19,7 +19,11 @@ package opennlp.tools.disambiguator; +import 
java.util.ArrayList; +import java.util.List; + import opennlp.tools.disambiguator.ims.IMS; +import opennlp.tools.util.Span; /** * This is a typical example of how to call the disambiguation function in the @@ -34,26 +38,83 @@ import opennlp.tools.disambiguator.ims.I public class IMSTester { public static void main(String[] args) { - + String modelsDir = "src\\test\\resources\\models\\"; WSDHelper.loadTokenizer(modelsDir+"en-token.bin"); WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict"); WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin"); - + IMS ims = new IMS(); - String test1 = "Please write to me soon."; + + /** + * This is how to make the context for one-word-disambiguation using IMS + */ + String test1 = "We need to discuss important topic, please write to me soon."; String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); - WSDHelper.print(ims.disambiguate(sentence1, 1)); + String[] tags1 = WSDHelper.getTagger().tag(sentence1); + List<String> tempLemmas1 = new ArrayList<String>(); + for (int i = 0; i < sentence1.length; i++) { + String lemma = WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]); + tempLemmas1.add(lemma); + } + String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); + + // output + String[] senses1 = ims.disambiguate(sentence1, tags1, lemmas1, 8); + System.out.print(lemmas1[8] + " :\t"); + WSDHelper.print(senses1); + WSDHelper.print("*****************************"); - String test2 = "it was a strong argument that his hypothesis was true"; + + /** + * This is how to make the context for disambiguation of span of words + */ + String test2 = "The component was highly radioactive to the point that" + + " it has been activated the second it touched water"; String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); - WSDHelper.print(ims.disambiguate(sentence2, 3)); + String[] tags2 = WSDHelper.getTagger().tag(sentence2); + List<String> tempLemmas2 = new ArrayList<String>(); + for (int i = 0; i < 
sentence2.length; i++) { + String lemma = WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]); + tempLemmas2.add(lemma); + } + String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); + Span span = new Span(3, 7); + + // output + List<String[]> senses2 = ims.disambiguate(sentence2, tags2, lemmas2, span); + for (int i = span.getStart(); i < span.getEnd() + 1; i++) { + String[] senses = senses2.get(i-span.getStart()); + System.out.print(lemmas2[i] + " :\t"); + WSDHelper.print(senses); + WSDHelper.print("----------"); + } - String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water"; - String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); - WSDHelper.print(ims.disambiguate(sentence3, 12)); + WSDHelper.print("*****************************"); + + /** + * This is how to make the context for all-words-disambiguation + */ + String test3 = "The summer almost over and I not to the beach even once"; + String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); + String[] tags3 = WSDHelper.getTagger().tag(sentence3); + List<String> tempLemmas3 = new ArrayList<String>(); + for (int i = 0; i < sentence3.length; i++) { + String lemma = WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]); + tempLemmas3.add(lemma); + } + String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); + + // output + List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3); + for (int i = 0; i < sentence3.length; i++) { + String[] senses = senses3.get(i); + System.out.print(lemmas3[i] + " :\t"); + WSDHelper.print(senses); + WSDHelper.print("----------"); + } } } Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java?rev=1696865&r1=1696864&r2=1696865&view=diff 
============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java Thu Aug 20 22:01:59 2015 @@ -20,10 +20,8 @@ package opennlp.tools.disambiguator; import java.util.ArrayList; -import java.util.List; import opennlp.tools.disambiguator.datareader.SensevalReader; -import opennlp.tools.disambiguator.ims.WTDIMS; import opennlp.tools.disambiguator.lesk.Lesk; import opennlp.tools.disambiguator.lesk.LeskParameters; @@ -56,7 +54,7 @@ public class LeskEvaluatorTest { // don't take verbs because they are not from WordNet if (!word.split("\\.")[1].equals("v")) { - ArrayList<WSDSample> instances = getTestData(word); + ArrayList<WSDSample> instances = seReader.getSensevalData(word); if (instances != null) { WSDHelper.print("------------------" + word + "------------------"); for (WSDSample instance : instances) { @@ -73,37 +71,5 @@ public class LeskEvaluatorTest { } } - protected static ArrayList<WSDSample> getTestData(String wordTag) { - - ArrayList<WSDSample> instances = new ArrayList<WSDSample>(); - for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) { - List<WordPOS> words = WSDHelper.getAllRelevantWords(wtd); - int targetWordIndex = 0; - for (int i = 0; i < words.size(); i++) { - if (words.get(i).isTarget) { - targetWordIndex = i; - } - } - String[] tags = new String[words.size()]; - String[] tokens = new String[words.size()]; - for (int i = 0; i < words.size(); i++) { - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - } - String targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - - WSDSample sample = new WSDSample(tokens, tags, targetWordIndex, - targetLemma); - sample.setSenseIDs(wtd.getSenseIDs()); - if (sample != null) { - if (sample.getSenseIDs().get(0) != null - 
&& !sample.getSenseIDs().get(0).equalsIgnoreCase("U")) { - instances.add(sample); - } - } - } - return instances; - } } Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java Thu Aug 20 22:01:59 2015 @@ -19,7 +19,7 @@ package opennlp.tools.disambiguator; - +import java.util.ArrayList; import java.util.List; import opennlp.tools.disambiguator.lesk.Lesk; @@ -32,7 +32,6 @@ public class LeskTester { @Test public static void main(String[] args) { - Lesk lesk = new Lesk(); LeskParameters params = new LeskParameters(); params.setLeskType(LESK_TYPE.LESK_EXT); @@ -40,72 +39,60 @@ public class LeskTester { params.setFeatures(a); lesk.setParams(params); String modelsDir = "src\\test\\resources\\models\\"; - WSDHelper.loadTokenizer(modelsDir+"en-token.bin"); - WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict"); - WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin"); - + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); + WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); + String test1 = "I went to the bank to deposit money."; - String[] sentence = WSDHelper.getTokenizer().tokenize(test1); - List<WordPOS> words = WSDHelper.getAllRelevantWords(sentence); - int targetWordIndex = 0; - String[] tags = new String[words.size()]; - String[] tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - WSDHelper.print("token : "+ tokens[i] + "_" + tags[i]); + 
String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); + int targetWordIndex1 = 5; + String[] tags1 = WSDHelper.getTagger().tag(sentence1); + List<String> tempLemmas1 = new ArrayList<String>(); + for (int i = 0; i < sentence1.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence1[i], tags1[i]); + tempLemmas1.add(lemma); } - String targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - // Constants.print("lemma : "+ targetLemma); - WSDHelper.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(lesk, - lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma)); - + String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]); + String[] results1 = lesk.disambiguate(sentence1, tags1, lemmas1, + targetWordIndex1); + WSDHelper.print(results1); + WSDHelper.printResults(lesk, results1); + WSDHelper.print("----------------------------------------"); - + String test2 = "it was a strong argument that his hypothesis was true"; - sentence = WSDHelper.getTokenizer().tokenize(test2); - words = WSDHelper.getAllRelevantWords(sentence); - targetWordIndex = 1; - tags = new String[words.size()]; - tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - //Constants.print("token : "+ tokens[i] + "_" + tags[i]); + String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); + int targetWordIndex2 = 4; + String[] tags2 = WSDHelper.getTagger().tag(sentence2); + List<String> tempLemmas2 = new ArrayList<String>(); + for (int i = 0; i < sentence2.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence2[i], tags2[i]); + tempLemmas2.add(lemma); } - targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - //Constants.print("lemma : "+ targetLemma); - - WSDHelper.print(lesk.disambiguate(tokens,
tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(lesk, - lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma)); + String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); + String[] results2 = lesk.disambiguate(sentence2, tags2, lemmas2, + targetWordIndex2); + WSDHelper.print(results2); + WSDHelper.printResults(lesk, results2); WSDHelper.print("----------------------------------------"); - + String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water"; - - sentence = WSDHelper.getTokenizer().tokenize(test3); - words = WSDHelper.getAllRelevantWords(sentence); - targetWordIndex = 4; - tags = new String[words.size()]; - tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - //Constants.print("token : "+ tokens[i] + "_" + tags[i]); + String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); + int targetWordIndex3 = 3; + String[] tags3 = WSDHelper.getTagger().tag(sentence3); + List<String> tempLemmas3 = new ArrayList<String>(); + for (int i = 0; i < sentence3.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence3[i], tags3[i]); + tempLemmas3.add(lemma); } - targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - //Constants.print("lemma : "+ targetLemma); - - WSDHelper.print(lesk.disambiguate(tokens, tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(lesk, - lesk.disambiguate(tokens, tags, targetWordIndex, targetLemma)); + String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); + String[] results3 = lesk.disambiguate(sentence3, tags3, lemmas3, + targetWordIndex3); + WSDHelper.print(results3); + WSDHelper.printResults(lesk, results3); WSDHelper.print("----------------------------------------"); } Modified: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java Thu Aug 20 22:01:59 2015 @@ -36,9 +36,9 @@ public class MFSEvaluatorTest { public static void main(String[] args) { WSDHelper.print("Evaluation Started"); String modelsDir = "src\\test\\resources\\models\\"; - WSDHelper.loadTokenizer(modelsDir+"en-token.bin"); - WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict"); - WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin"); + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); + WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); MFS mfs = new MFS(); WSDParameters.isStemCompare = true; @@ -50,7 +50,7 @@ public class MFSEvaluatorTest { // don't take verbs because they are not from WordNet if (!word.split("\\.")[1].equals("v")) { - ArrayList<WSDSample> instances = getTestData(word); + ArrayList<WSDSample> instances = seReader.getSensevalData(word); if (instances != null) { WSDHelper.print("------------------" + word + "------------------"); @@ -70,38 +70,4 @@ public class MFSEvaluatorTest { } - /** - * For a specific word, return the Semeval3 corresponding instances in form of - * {@link WSDSample} - * - * @param wordTag - * the word of which the instances are to be collected. wordTag has - * to be in the format "word.POS" (e.g., "activate.v", "smart.a", - * etc.) 
- * @return list of {@link WSDSample} instances of the wordTag - */ - protected static ArrayList<WSDSample> getTestData(String wordTag) { - - ArrayList<WSDSample> instances = new ArrayList<WSDSample>(); - for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) { - - String targetLemma = WSDHelper.getLemmatizer().lemmatize(wtd.getWord(), - wtd.getPosTag()); - - WSDSample sample = new WSDSample(wtd.getSentence(), wtd.getPosTags(), - wtd.getWordIndex(), targetLemma); - sample.setSenseIDs(wtd.getSenseIDs()); - - if (sample != null) { - if (sample.getSenseIDs().get(0) != null - && !sample.getSenseIDs().get(0).equalsIgnoreCase("U")) { - instances.add(sample); - } - } - - } - - return instances; - } - } Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java Thu Aug 20 22:01:59 2015 @@ -19,9 +19,11 @@ package opennlp.tools.disambiguator; +import java.util.ArrayList; import java.util.List; import opennlp.tools.disambiguator.mfs.MFS; +import opennlp.tools.util.Span; /** * This is a typical example of how to call the disambiguation function in the @@ -30,78 +32,83 @@ import opennlp.tools.disambiguator.mfs.M public class MFSTester { public static void main(String[] args) { - String modelsDir = "src\\test\\resources\\models\\"; - WSDHelper.loadTokenizer(modelsDir+"en-token.bin"); - WSDHelper.loadLemmatizer(modelsDir+"en-lemmatizer.dict"); - WSDHelper.loadTagger(modelsDir+"en-pos-maxent.bin"); - - + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); + WSDHelper.loadLemmatizer(modelsDir +
"en-lemmatizer.dict"); + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); + MFS mfs = new MFS(); - String test1 = "I went fishing for some sea bass."; - String[] sentence = WSDHelper.getTokenizer().tokenize(test1); - List<WordPOS> words = WSDHelper.getAllRelevantWords(sentence); - int targetWordIndex = 2; - String[] tags = new String[words.size()]; - String[] tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - // Constants.print("token : "+ tokens[i] + "_" + tags[i]); + /** + * This is how to make the context for one-word-disambiguation using MFS + */ + String test1 = "We need to discuss important topic, please write to me soon."; + String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1); + String[] tags1 = WSDHelper.getTagger().tag(sentence1); + List<String> tempLemmas1 = new ArrayList<String>(); + for (int i = 0; i < sentence1.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence1[i], tags1[i]); + tempLemmas1.add(lemma); } - String targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - // Constants.print("lemma : "+ targetLemma); - - WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(mfs, - mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma)); - WSDHelper.print("----------------------------------------"); - - String test2 = "it was a strong argument that his hypothesis was true"; - sentence = WSDHelper.getTokenizer().tokenize(test2); - words = WSDHelper.getAllRelevantWords(sentence); - targetWordIndex = 1; - tags = new String[words.size()]; - tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - //Constants.print("token : "+ tokens[i] + "_" + tags[i]); + String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+ + // output + String[] senses1 = mfs.disambiguate(sentence1, tags1, lemmas1, 8); + System.out.print(lemmas1[8] + " :\t"); + WSDHelper.print(senses1); + WSDHelper.print("*****************************"); + + /** + * This is how to make the context for disambiguation of span of words + */ + String test2 = "The component was highly radioactive to the point that" + + " it has been activated the second it touched water"; + String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2); + String[] tags2 = WSDHelper.getTagger().tag(sentence2); + List<String> tempLemmas2 = new ArrayList<String>(); + for (int i = 0; i < sentence2.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence2[i], tags2[i]); + tempLemmas2.add(lemma); } - targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - //Constants.print("lemma : "+ targetLemma); - - WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(mfs, - mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma)); - WSDHelper.print("----------------------------------------"); - - String test3 = "the component was highly radioactive to the point that it has been activated the second it touched water"; - - sentence = WSDHelper.getTokenizer().tokenize(test3); - words = WSDHelper.getAllRelevantWords(sentence); - targetWordIndex = 4; - tags = new String[words.size()]; - tokens = new String[words.size()]; - for (int i=0;i<words.size();i++){ - tags[i] = words.get(i).getPosTag(); - tokens[i] = words.get(i).getWord(); - - //Constants.print("token : "+ tokens[i] + "_" + tags[i]); + String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]); + Span span = new Span(3, 7); + + // output + List<String[]> senses2 = mfs.disambiguate(sentence2, tags2, lemmas2, span); + for (int i = span.getStart(); i < span.getEnd() + 1; i++) { + String[] senses = senses2.get(i - span.getStart()); + System.out.print(lemmas2[i] + " :\t"); 
+ WSDHelper.print(senses); + WSDHelper.print("----------"); } - targetLemma = WSDHelper.getLemmatizer().lemmatize( - tokens[targetWordIndex], tags[targetWordIndex]); - //Constants.print("lemma : "+ targetLemma); - - WSDHelper.print(mfs.disambiguate(tokens, tags, targetWordIndex,targetLemma)); - WSDHelper.printResults(mfs, - mfs.disambiguate(tokens, tags, targetWordIndex, targetLemma)); - WSDHelper.print("----------------------------------------"); + + WSDHelper.print("*****************************"); + + /** + * This is how to make the context for all-words-disambiguation + */ + String test3 = "The summer is almost over and I have not been to the beach even once"; + String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); + String[] tags3 = WSDHelper.getTagger().tag(sentence3); + List<String> tempLemmas3 = new ArrayList<String>(); + for (int i = 0; i < sentence3.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence3[i], tags3[i]); + tempLemmas3.add(lemma); + } + String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); + + // output + List<String[]> senses3 = mfs.disambiguate(sentence3, tags3, lemmas3); + for (int i = 0; i < sentence3.length; i++) { + String[] senses = senses3.get(i); + System.out.print(lemmas3[i] + " :\t"); + WSDHelper.print(senses); + WSDHelper.print("----------"); + } + } } \ No newline at end of file Modified: opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java URL: http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java?rev=1696865&r1=1696864&r2=1696865&view=diff ============================================================================== --- opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java (original) +++ opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java Thu Aug 20 22:01:59 2015 @@ -0,0 +1,39 @@ +package opennlp.tools.disambiguator; + +import 
java.util.ArrayList; +import java.util.List; + +import opennlp.tools.disambiguator.ims.IMS; + +public class Tester { + + public static void main(String[] args) { + + String modelsDir = "src\\test\\resources\\models\\"; + WSDHelper.loadTokenizer(modelsDir + "en-token.bin"); + WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict"); + WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin"); + + IMS ims = new IMS(); + + String test3 = "The summer is almost over and I haven't been to the beach even once"; + String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3); + String[] tags3 = WSDHelper.getTagger().tag(sentence3); + List<String> tempLemmas3 = new ArrayList<String>(); + for (int i = 0; i < sentence3.length; i++) { + String lemma = WSDHelper.getLemmatizer() + .lemmatize(sentence3[i], tags3[i]); + tempLemmas3.add(lemma); + } + String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]); + + // output + List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3); + for (int i = 0; i < sentence3.length; i++) { + System.out.print(sentence3[i] + " : "); + WSDHelper.printResults(ims, senses3.get(i)); + WSDHelper.print("----------"); + } + + } +} \ No newline at end of file
