Author: joern
Date: Thu Jul 2 16:08:55 2015
New Revision: 1688852
URL: http://svn.apache.org/r1688852
Log:
OPENNLP-758 Applied clean up patch. Thanks to Anthony Beylerian for providing
a patch.
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
(with props)
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
Thu Jul 2 16:08:55 2015
@@ -22,6 +22,8 @@ package opennlp.tools.disambiguator;
import java.util.ArrayList;
import java.util.Arrays;
+import opennlp.tools.disambiguator.lesk.Lesk;
+import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
public class Constants {
@@ -125,22 +127,62 @@ public class Constants {
"you're", "yours", "yourself", "yourselves", "you've", "zero"));
// Print a text in the console
+ public static void printResults(WSDisambiguator disambiguator,
+ String[] results) {
+
+ if (results != null) {
+
+ if (disambiguator instanceof Lesk) {
+ POS pos;
+ long offset;
+ double score;
+ String[] parts;
+
+ for (String result : results) {
+ parts = result.split("@");
+ pos = POS.getPOSForKey(parts[0]);
+ offset = Long.parseLong(parts[1]);
+ score = Double.parseDouble(parts[2]);
+ try {
+ Constants.print("score : " + score + " for : "
+ + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ }
+
public static void print(Object in) {
- System.out.println(in);
+ if (in == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.println(in);
+ }
}
public static void print(Object[] array) {
- System.out.println(Arrays.asList(array));
+ if (array == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.println(Arrays.asList(array));
+ }
}
public static void print(Object[][] array) {
- System.out.print("[");
- for (int i = 0; i < array.length; i++) {
- print(array[i]);
- if (i != array.length - 1) {
- System.out.print("\n");
+ if (array == null) {
+ System.out.println("object is null");
+ } else {
+ System.out.print("[");
+ for (int i = 0; i < array.length; i++) {
+ print(array[i]);
+ if (i != array.length - 1) {
+ System.out.print("\n");
+ }
+ print("]");
}
- print("]");
}
}
@@ -169,6 +211,15 @@ public class Constants {
}
+ public static boolean isRelevant(String posTag) {
+ return getPOS(posTag) != null;
+ }
+
+ public static boolean isRelevant(POS pos) {
+ return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
+ || pos.equals(POS.NOUN) || pos.equals(POS.VERB);
+ }
+
// Check whether a list of arrays contains an array
public static boolean belongsTo(String[] array, ArrayList<String[]>
fullList) {
for (String[] refArray : fullList) {
@@ -196,5 +247,4 @@ public class Constants {
return true;
}
-
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
Thu Jul 2 16:08:55 2015
@@ -41,10 +41,6 @@ import opennlp.tools.disambiguator.ims.W
public class DataExtractor {
- /**
- * Constructor
- */
-
public DataExtractor() {
super();
}
@@ -52,7 +48,6 @@ public class DataExtractor {
/**
* Extract the dictionary from the dictionary XML file and map the senses
*/
-
private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {
ArrayList<DictionaryInstance> dictionary = new
ArrayList<DictionaryInstance>();
@@ -283,9 +278,9 @@ public class DataExtractor {
* Extract the training instances from the training/test set File
*/
- public HashMap<Integer, WTDIMS> extractWSDInstances(String xmlDataSet) {
+ public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {
- HashMap<Integer, WTDIMS> setInstances = new HashMap<Integer, WTDIMS>();
+ ArrayList<WTDIMS> setInstances = new ArrayList<WTDIMS>();
try {
@@ -298,8 +293,6 @@ public class DataExtractor {
NodeList lexelts = doc.getElementsByTagName("lexelt");
- int index = 0;
-
for (int i = 0; i < lexelts.getLength(); i++) {
Node nLexelt = lexelts.item(i);
@@ -371,16 +364,12 @@ public class DataExtractor {
WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
rawWord);
- setInstances.put(index, wordToDisambiguate);
- index++;
+ setInstances.add(wordToDisambiguate);
// System.out.print(index + "\t");
// System.out.println(wordToDisambiguate.toString());
}
-
}
-
}
-
}
} catch (Exception e) {
@@ -390,5 +379,4 @@ public class DataExtractor {
return setInstances;
}
-
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
Thu Jul 2 16:08:55 2015
@@ -20,8 +20,7 @@
package opennlp.tools.disambiguator;
public class DictionaryInstance {
-
-
+
protected int index;
protected String word;
@@ -31,7 +30,6 @@ public class DictionaryInstance {
protected String[] synset;
protected String gloss;
-
public DictionaryInstance(int index, String word, String id, String source,
String[] synset, String gloss) {
super();
@@ -43,6 +41,10 @@ public class DictionaryInstance {
this.gloss = gloss;
}
+ /**
+ * Getters and Setters
+ */
+
public int getIndex() {
return index;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
Thu Jul 2 16:08:55 2015
@@ -20,8 +20,6 @@
package opennlp.tools.disambiguator;
-
-
import java.util.ArrayList;
import opennlp.tools.disambiguator.ims.WTDIMS;
@@ -44,8 +42,6 @@ public class FeaturesExtractor {
* Collocations: it requires one parameter: "the n-gram"
*
*/
-
- // private methods
private String[] extractPosOfSurroundingWords(String[] sentence,
int wordIndex, int windowSize) {
@@ -94,7 +90,7 @@ public class FeaturesExtractor {
private String[] extractLocalCollocations(String[] sentence, int wordIndex,
int ngram) {
- /*
+ /**
* Here the author used only 11 features of this type. the range was set to
* 3 (bigrams extracted in a way that they are at max separated by 1 word).
*/
@@ -124,6 +120,7 @@ public class FeaturesExtractor {
return res;
}
+ // public method
/**
* This method generates the different set of features related to the IMS
* approach and store them in the corresponding attributes of the WTDIMS
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Node.java
Thu Jul 2 16:08:55 2015
@@ -31,7 +31,6 @@ import net.sf.extjwnl.data.list.PointerT
/**
* Convenience class to access some features.
*/
-
public class Node {
public Synset parent;
@@ -69,6 +68,10 @@ public class Node {
public String getSense() {
return this.synset.getGloss().toString();
}
+
+ public long getSenseID() {
+ return this.synset.getOffset();
+ }
public void setHypernyms() {
// PointerUtils pointerUtils = PointerUtils.get();
@@ -168,5 +171,4 @@ public class Node {
public ArrayList<WordPOS> getSynonyms() {
return synonyms;
}
-
}
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java?rev=1688852&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
Thu Jul 2 16:08:55 2015
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+/**
+ * Disambiguation Parameters
+ *
+ */
+public abstract class WSDParameters {
+
+ protected boolean isCoarseSense;
+
+ /**
+ * @return true if the disambiguation type is coarse grained, false if it is fine grained
+ */
+ public boolean isCoarseSense() {
+ return isCoarseSense;
+ }
+
+ public void setCoarseSense(boolean isCoarseSense) {
+ this.isCoarseSense = isCoarseSense;
+ }
+
+ public WSDParameters(){
+ this.isCoarseSense = true;
+ }
+
+ /**
+ * @return true if the parameters are valid, false otherwise
+ */
+ public abstract boolean isValid();
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
Thu Jul 2 16:08:55 2015
@@ -19,14 +19,49 @@
package opennlp.tools.disambiguator;
+import java.security.InvalidParameterException;
import opennlp.tools.util.Span;
+
/**
- * The interface for word sense disambiguators.
+ * A word sense disambiguator that determines which sense of a word is meant
in a particular context.
+ * It is a classification task, where the classes are the different senses of
the ambiguous word.
+ * Disambiguation can be achieved in either supervised or un-supervised
approaches.
+ * For the moment this component relies on WordNet to retrieve sense
definitions.
+ * It returns an array of WordNet sense IDs ordered by their disambiguation
score.
+ * The sense with highest score is the most likely sense of the word.
+ *
+ * Please see {@link Lesk} for an un-supervised approach.
+ * Please see {@link IMS} for a supervised approach.
+ *
+ * @see Lesk
+ * @see IMS
*/
public interface WSDisambiguator {
- public String[] disambiguate(String[] inputText, int inputWordIndex);
+
+ /**
+ * @return the parameters of the disambiguation algorithm
+ */
+ public WSDParameters getParams();
+
+ /**
+ * @param params the disambiguation implementation specific parameters.
+ * @throws InvalidParameterException
+ */
+ public void setParams(WSDParameters params) throws InvalidParameterException;
+
+ /**
+ * @param tokenizedContext
+ * @param ambiguousTokenIndex
+ * @return result as an array of WordNet IDs
+ */
+ public String[] disambiguate(String[] tokenizedContext, int
ambiguousTokenIndex);
- public String[] disambiguate(String[] inputText, Span[] inputWordSpans);
+ /**
+ * @param tokenizedContext
+ * @param ambiguousTokenIndexSpans
+ * @return result as an array of WordNet IDs
+ */
+ public String[][] disambiguate(String[] tokenizedContext, Span[]
ambiguousTokenIndexSpans);
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
Thu Jul 2 16:08:55 2015
@@ -28,13 +28,13 @@ import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
+// TODO extend Word instead
public class WordPOS {
private String word;
private List stems;
private POS pos;
- // Constructor
public WordPOS(String word, POS pos) throws IllegalArgumentException {
if (word == null || pos == null) {
throw new IllegalArgumentException("Args are null");
@@ -106,5 +106,4 @@ public class WordPOS {
}
return false;
}
-
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
Thu Jul 2 16:08:55 2015
@@ -25,160 +25,141 @@ import net.sf.extjwnl.data.POS;
-
public class WordToDisambiguate {
-
- // TODO Check if it is necessary to add an attribute [word] since the
word in the sentence is not necessarily in the base form ??
-
- protected String [] sentence;
- protected String [] posTags;
-
- protected int wordIndex;
-
- protected int sense;
-
- protected ArrayList<String> senseID;
-
-
-
- /**
- * Constructor
- */
-
-
- public WordToDisambiguate(String[] sentence, int wordIndex, int sense)
throws IllegalArgumentException{
- super();
-
- if (wordIndex>sentence.length){
- throw new IllegalArgumentException("The index is out of
bounds !");
- }
-
- this.sentence = sentence;
- this.posTags = PreProcessor.tag(sentence);
-
- this.wordIndex = wordIndex;
-
- this.sense = sense;
- }
-
- public WordToDisambiguate(String[] sentence, int wordIndex) {
- this(sentence,wordIndex,-1);
- }
-
- public WordToDisambiguate() {
- String[] emptyString = {};
- int emptyInteger = 0;
-
- this.sentence = emptyString;
- this.wordIndex = emptyInteger;
- this.sense = -1;
-
- }
-
-
- /**
- * Getters and Setters
- */
-
- // Sentence
- public String[] getSentence() {
- return sentence;
- }
-
- public void setSentence(String[] sentence) {
- this.sentence = sentence;
- }
-
-
- // Sentence Pos-Tags
- public String[] getPosTags() {
- return posTags;
- }
-
- public void setPosTags(String[] posTags) {
- this.posTags = posTags;
- }
-
-
- // Word to disambiguate
- public int getWordIndex() {
- return wordIndex;
- }
-
- public String getRawWord() {
-
- /**
- * For example, from the word "running" it returns "run.v"
- */
-
- String wordBaseForm =
Loader.getLemmatizer().lemmatize(this.sentence[wordIndex],
this.posTags[wordIndex]);
-
- String ref = "";
-
- if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB))
{
- ref = wordBaseForm + ".v";
- } else if
(Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
- ref = wordBaseForm + ".n";
- } else if
(Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADJECTIVE)) {
- ref = wordBaseForm + ".a";
- } else if
(Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
- ref = wordBaseForm + ".r";
- } else {
-
- }
-
- return ref;
-
- }
-
- public String getWord() {
- return this.sentence[this.wordIndex];
- }
-
- public String getPosTag() {
- return this.posTags[this.wordIndex];
- }
-
- public void setWordIndex(int wordIndex) {
- this.wordIndex = wordIndex;
- }
-
-
-
-
- // Word to disambiguate sense
- public int getSense() {
- return sense;
- }
-
- public void setSense(int sense) {
- this.sense = sense;
- }
-
-
-
- // Sense as in the source
- // TODO fix the conflict between this ID of the sense and that in the
attribute [sense]
- public ArrayList<String> getSenseID() {
- return senseID;
- }
-
- public void setSenseID(ArrayList<String> senseID) {
- this.senseID = senseID;
- }
-
-
-
-
- /**
- * toString
- */
-
- public String toString() {
- return (wordIndex + "\t" + getWord() + "\n" + sentence);
- }
-
-
+ // TODO Check if it is necessary to add an attribute [word] since the word in
+ // the sentence is not necessarily in the base form ??
+
+ protected String[] sentence;
+ protected String[] posTags;
+
+ protected int wordIndex;
+
+ protected int sense;
+
+ protected ArrayList<String> senseIDs;
+
+ /**
+ * Constructor
+ */
+
+ public WordToDisambiguate(String[] sentence, int wordIndex, int sense)
+ throws IllegalArgumentException {
+ super();
+
+ if (wordIndex > sentence.length) {
+ throw new IllegalArgumentException("The index is out of bounds !");
+ }
+
+ this.sentence = sentence;
+ this.posTags = PreProcessor.tag(sentence);
+
+ this.wordIndex = wordIndex;
+
+ this.sense = sense;
+ }
+
+ public WordToDisambiguate(String[] sentence, int wordIndex) {
+ this(sentence, wordIndex, -1);
+ }
+
+ public WordToDisambiguate() {
+ String[] emptyString = {};
+ int emptyInteger = 0;
+
+ this.sentence = emptyString;
+ this.wordIndex = emptyInteger;
+ this.sense = -1;
+
+ }
+
+ /**
+ * Getters and Setters
+ */
+
+ // Sentence
+ public String[] getSentence() {
+ return sentence;
+ }
+
+ public void setSentence(String[] sentence) {
+ this.sentence = sentence;
+ }
+
+ // Sentence Pos-Tags
+ public String[] getPosTags() {
+ return posTags;
+ }
+
+ public void setPosTags(String[] posTags) {
+ this.posTags = posTags;
+ }
+
+ // Word to disambiguate
+ public int getWordIndex() {
+ return wordIndex;
+ }
+
+ public String getRawWord() {
+
+ /**
+ * For example, from the word "running" it returns "run.v"
+ */
+
+ String wordBaseForm = Loader.getLemmatizer().lemmatize(
+ this.sentence[wordIndex], this.posTags[wordIndex]);
+
+ String ref = "";
+
+ if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.VERB)) {
+ ref = wordBaseForm + ".v";
+ } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.NOUN)) {
+ ref = wordBaseForm + ".n";
+ } else if
(Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADJECTIVE)) {
+ ref = wordBaseForm + ".a";
+ } else if (Constants.getPOS(this.posTags[wordIndex]).equals(POS.ADVERB)) {
+ ref = wordBaseForm + ".r";
+ } else {
+
+ }
+
+ return ref;
+
+ }
+
+ public String getWord() {
+ return this.sentence[this.wordIndex];
+ }
+
+ public String getPosTag() {
+ return this.posTags[this.wordIndex];
+ }
+
+ public void setWordIndex(int wordIndex) {
+ this.wordIndex = wordIndex;
+ }
+
+ // Word to disambiguate sense
+ public int getSense() {
+ return sense;
+ }
+
+ public void setSense(int sense) {
+ this.sense = sense;
+ }
+
+ // Sense as in the source
+ // TODO fix the conflict between this ID of the sense and that in the
+ // attribute [sense]
+ public ArrayList<String> getSenseIDs() {
+ return senseIDs;
+ }
+
+ public void setSenseIDs(ArrayList<String> senseIDs) {
+ this.senseIDs = senseIDs;
+ }
+ public String toString() {
+ return (wordIndex + "\t" + getWord() + "\n" + sentence);
+ }
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
Thu Jul 2 16:08:55 2015
@@ -34,8 +34,8 @@ public class DefaultIMSContextGenerator
/**
* Default context generator for IMS.
*/
- public DefaultIMSContextGenerator() {
+ public DefaultIMSContextGenerator() {
}
/**
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
Thu Jul 2 16:08:55 2015
@@ -35,6 +35,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
+import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.zip.GZIPInputStream;
@@ -51,6 +52,7 @@ import opennlp.tools.disambiguator.Const
import opennlp.tools.disambiguator.DataExtractor;
import opennlp.tools.disambiguator.FeaturesExtractor;
import opennlp.tools.disambiguator.PreProcessor;
+import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WSDisambiguator;
@@ -64,17 +66,11 @@ public class IMS implements WSDisambigua
private FeaturesExtractor fExtractor = new FeaturesExtractor();
private DataExtractor dExtractor = new DataExtractor();
- /**
- * PARAMETERS
- */
private int windowSize;
+ private int word;
private int ngram;
- /**
- * Constructors
- */
-
public IMS() {
super();
windowSize = 3;
@@ -95,11 +91,7 @@ public class IMS implements WSDisambigua
this.cg = factory.createContextGenerator();
}
- /**
- * INTERNAL METHODS
- */
-
- protected HashMap<Integer, WTDIMS> extractTrainingData(
+ protected ArrayList<WTDIMS> extractTrainingData(
String wordTrainingxmlFile,
HashMap<String, ArrayList<DictionaryInstance>> senses) {
@@ -108,18 +100,18 @@ public class IMS implements WSDisambigua
* etc.)
*/
- HashMap<Integer, WTDIMS> trainingData = dExtractor
+ ArrayList<WTDIMS> trainingData = dExtractor
.extractWSDInstances(wordTrainingxmlFile);
// HashMap<Integer, WTDIMS> trainingData =
// dExtractor.extractWSDInstances(wordTrainingxmlFile);
- for (Integer key : trainingData.keySet()) {
- for (String senseId : trainingData.get(key).getSenseID()) {
+ for (WTDIMS data : trainingData) {
+ for (String senseId : data.getSenseIDs()) {
for (String dictKey : senses.keySet()) {
for (DictionaryInstance instance : senses.get(dictKey)) {
if (senseId.equals(instance.getId())) {
- trainingData.get(key).setSense(
+ data.setSense(
Integer.parseInt(dictKey.split("_")[1]));
break;
}
@@ -131,11 +123,11 @@ public class IMS implements WSDisambigua
return trainingData;
}
- protected void extractFeature(HashMap<Integer, WTDIMS> words) {
+ protected void extractFeature(ArrayList<WTDIMS> words) {
- for (Integer key : words.keySet()) {
+ for (WTDIMS word : words) {
- fExtractor.extractIMSFeatures(words.get(key), windowSize, ngram);
+ fExtractor.extractIMSFeatures(word, windowSize, ngram);
}
@@ -217,18 +209,18 @@ public class IMS implements WSDisambigua
HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
.extractWordSenses(dict, map, wordTag);
- HashMap<Integer, WTDIMS> instances = extractTrainingData(
+ ArrayList<WTDIMS> instances = extractTrainingData(
wordTrainingxmlFile, senses);
extractFeature(instances);
ArrayList<Event> events = new ArrayList<Event>();
- for (int key : instances.keySet()) {
+ for (WTDIMS instance : instances) {
- int sense = instances.get(key).getSense();
+ int sense = instance.getSense();
- String[] context = cg.getContext(instances.get(key));
+ String[] context = cg.getContext(instance);
Event ev = new Event(sense + "", context);
@@ -338,9 +330,16 @@ public class IMS implements WSDisambigua
}
@Override
- public String[] disambiguate(String[] inputText, Span[] inputWordSpans) {
- // TODO Auto-generated method stub
+ public String[][] disambiguate(String[] inputText, Span[] inputWordSpans) {
return null;
}
+ @Override
+ public WSDParameters getParams() {
+ return null;
+ }
+
+ @Override
+ public void setParams(WSDParameters params) throws InvalidParameterException
{
+ }
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
Thu Jul 2 16:08:55 2015
@@ -25,5 +25,4 @@ package opennlp.tools.disambiguator.ims;
public interface IMSContextGenerator {
public String[] getContext(WTDIMS word);
-
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
Thu Jul 2 16:08:55 2015
@@ -99,4 +99,3 @@ public class IMSFactory extends BaseTool
public void validateArtifactMap() throws InvalidFormatException {
}
}
-
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
Thu Jul 2 16:08:55 2015
@@ -19,8 +19,6 @@
package opennlp.tools.disambiguator.ims;
-
-
import java.util.ArrayList;
import opennlp.tools.disambiguator.PreProcessor;
@@ -34,7 +32,6 @@ public class WTDIMS extends WordToDisamb
public WTDIMS(String[] sentence, int word, int sense) {
super(sentence, word, sense);
-
}
public WTDIMS(String[] sentence, int word) {
@@ -57,8 +54,7 @@ public class WTDIMS extends WordToDisamb
}
}
- this.senseID = xmlAnswers;
-
+ this.senseIDs = xmlAnswers;
}
public String[] getPosOfSurroundingWords() {
@@ -85,4 +81,3 @@ public class WTDIMS extends WordToDisamb
this.localCollocations = localCollocations;
}
}
-
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
Thu Jul 2 16:08:55 2015
@@ -21,13 +21,13 @@ package opennlp.tools.disambiguator.lesk
import java.security.InvalidParameterException;
import java.util.ArrayList;
-
import java.util.Collections;
import opennlp.tools.disambiguator.Constants;
import opennlp.tools.disambiguator.Loader;
import opennlp.tools.disambiguator.Node;
import opennlp.tools.disambiguator.PreProcessor;
+import opennlp.tools.disambiguator.WSDParameters;
import opennlp.tools.disambiguator.WSDisambiguator;
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WordSense;
@@ -35,36 +35,68 @@ import opennlp.tools.util.Span;
import net.sf.extjwnl.data.Synset;
/**
- * Class for the Lesk algorithm and variants.
+ * Implementation of the <b>Overlap Of Senses</b> approach originally proposed
by Lesk.
+ * The main idea is to check for word overlaps in the sense definitions of the
surrounding context.
+ * An overlap is when two words have similar stems.
+ * The more overlaps a word has the higher its score.
+ * Different variations of the approach are included in this class.
+ *
*/
-
public class Lesk implements WSDisambiguator {
+ /**
+ * The lesk specific parameters
+ */
protected LeskParameters params;
-
- public Loader loader;
-
+
public Lesk() {
this(null);
}
+ /**
+ * Initializes the loader object and sets the input parameters
+ * @param params the input parameters
+ * @throws InvalidParameterException
+ */
public Lesk(LeskParameters params) throws InvalidParameterException {
- loader = new Loader();
+ Loader loader = new Loader();
this.setParams(params);
}
+
- public void setParams(LeskParameters params) throws
InvalidParameterException {
+ /**
+ * If the parameters are null, sets the default ones; otherwise sets them only if they are valid.
+ * Invalid parameters cause an exception to be thrown.
+ *
+ * @param params the input parameters
+ * @throws InvalidParameterException if the given parameters are not valid
+ */
+ @Override
+ public void setParams(WSDParameters params) throws InvalidParameterException
{
if (params == null) {
this.params = new LeskParameters();
} else {
if (params.isValid()) {
- this.params = params;
+ this.params = (LeskParameters) params;
} else {
throw new InvalidParameterException("wrong params");
}
}
}
+ /**
+ * @return the parameter settings
+ */
+ public LeskParameters getParams() {
+ return params;
+ }
+
+ /**
+ * The basic Lesk method where the entire context is considered for overlaps.
+ *
+ * @param wtd the word to disambiguate
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> basic(WTDLesk wtd) {
ArrayList<WordPOS> relvWords = PreProcessor.getAllRelevantWords(wtd);
@@ -98,14 +130,31 @@ public class Lesk implements WSDisambigu
return scoredSenses;
}
+ /**
+ * The basic Lesk method but applied to a default context window.
+ * @param wtd the word to disambiguate
+ * @return The array of WordSenses with their scores
+ */
public ArrayList<WordSense> basicContextual(WTDLesk wtd) {
return this.basicContextual(wtd, LeskParameters.DFLT_WIN_SIZE);
}
+ /**
+ * The basic Lesk method but applied to a custom context window.
+ * @param wtd the word to disambiguate
+ * @param windowSize
+ * @return The array of WordSenses with their scores
+ */
public ArrayList<WordSense> basicContextual(WTDLesk wtd, int windowSize) {
return this.basicContextual(wtd, windowSize, windowSize);
}
+ /**
+ * The basic Lesk method but applied to a context window set by custom backward and forward window lengths.
+ * @param wtd the word to disambiguate
+ * @param windowBackward
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> basicContextual(WTDLesk wtd, int windowBackward,
int windowForward) {
@@ -146,6 +195,19 @@ public class Lesk implements WSDisambigu
return scoredSenses;
}
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps across the entire context.
+ * The scoring function uses linear weights.
+ * @param wtd the word to disambiguate
+ * @param depth how deep to go into each feature tree
+ * @param depthScoreWeight the weighing per depth level
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extended(WTDLesk wtd, int depth,
double depthScoreWeight, boolean includeSynonyms,
boolean includeHypernyms, boolean includeHyponyms,
@@ -156,6 +218,19 @@ public class Lesk implements WSDisambigu
}
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a default context window.
+ * The scoring function uses linear weights.
+ * @param wtd the word to disambiguate
+ * @param depth how deep to go into each feature tree
+ * @param depthScoreWeight the weighing per depth level
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedContextual(WTDLesk wtd, int depth,
double depthScoreWeight, boolean includeSynonyms,
boolean includeHypernyms, boolean includeHyponyms,
@@ -167,6 +242,20 @@ public class Lesk implements WSDisambigu
}
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a custom context window.
+ * The scoring function uses linear weights.
+ * @param wtd the word to disambiguate
+ * @param windowSize the custom context window size
+ * @param depth how deep to go into each feature tree
+ * @param depthScoreWeight the weighing per depth level
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedContextual(WTDLesk wtd, int windowSize,
int depth, double depthScoreWeight, boolean includeSynonyms,
boolean includeHypernyms, boolean includeHyponyms,
@@ -177,6 +266,22 @@ public class Lesk implements WSDisambigu
includeMeronyms, includeHolonyms);
}
+
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a custom context window.
+ * The scoring function uses linear weights.
+ * @param wtd the word to disambiguate
+ * @param windowBackward the custom context backward window size
+ * @param windowForward the custom context forward window size
+ * @param depth how deep to go into each feature tree
+ * @param depthScoreWeight the weighing per depth level
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedContextual(WTDLesk wtd,
int windowBackward, int windowForward, int depth,
double depthScoreWeight, boolean includeSynonyms,
@@ -236,6 +341,21 @@ public class Lesk implements WSDisambigu
}
+
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in all the context.
+ * The scoring function uses exponential weights.
+ * @param wtd the word to disambiguate
+ * @param depth how deep to go into each feature tree
+ * @param intersectionExponent
+ * @param depthExponent
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedExponential(WTDLesk wtd, int depth,
double intersectionExponent, double depthExponent,
boolean includeSynonyms, boolean includeHypernyms,
@@ -246,7 +366,21 @@ public class Lesk implements WSDisambigu
includeMeronyms, includeHolonyms);
}
-
+
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a default window in the context.
+ * The scoring function uses exponential weights.
+ * @param wtd the word to disambiguate
+ * @param depth how deep to go into each feature tree
+ * @param intersectionExponent
+ * @param depthExponent
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
int depth, double intersectionExponent, double depthExponent,
boolean includeSynonyms, boolean includeHypernyms,
@@ -256,7 +390,22 @@ public class Lesk implements WSDisambigu
depth, intersectionExponent, depthExponent, includeSynonyms,
includeHypernyms, includeHyponyms, includeMeronyms, includeHolonyms);
}
-
+
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a custom window in the context.
+ * The scoring function uses exponential weights.
+ * @param wtd the word to disambiguate
+ * @param windowSize
+ * @param depth how deep to go into each feature tree
+ * @param intersectionExponent
+ * @param depthExponent
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
int windowSize, int depth, double intersectionExponent,
double depthExponent, boolean includeSynonyms, boolean includeHypernyms,
@@ -267,6 +416,22 @@ public class Lesk implements WSDisambigu
includeHyponyms, includeMeronyms, includeHolonyms);
}
+ /**
+ * An extended version of the Lesk approach that takes into consideration semantically related feature overlaps in a custom window in the context.
+ * The scoring function uses exponential weights.
+ * @param wtd the word to disambiguate
+ * @param windowBackward
+ * @param windowForward
+ * @param depth
+ * @param intersectionExponent
+ * @param depthExponent
+ * @param includeSynonyms
+ * @param includeHypernyms
+ * @param includeHyponyms
+ * @param includeMeronyms
+ * @param includeHolonyms
+ * @return the array of WordSenses with their scores
+ */
public ArrayList<WordSense> extendedExponentialContextual(WTDLesk wtd,
int windowBackward, int windowForward, int depth,
double intersectionExponent, double depthExponent,
@@ -327,6 +492,15 @@ public class Lesk implements WSDisambigu
}
+ /**
+ * Recursively score the hypernym tree linearly
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param depthScoreWeight
+ */
private void fathomHypernyms(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double depthScoreWeight) {
@@ -350,6 +524,16 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the hypernym tree exponentially
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param intersectionExponent
+ * @param depthScoreExponent
+ */
private void fathomHypernymsExponential(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double intersectionExponent, double depthScoreExponent) {
@@ -374,6 +558,15 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the hyponym tree linearly
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param depthScoreWeight
+ */
private void fathomHyponyms(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double depthScoreWeight) {
@@ -398,6 +591,16 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the hyponym tree exponentially
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param intersectionExponent
+ * @param depthScoreExponent
+ */
private void fathomHyponymsExponential(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double intersectionExponent, double depthScoreExponent) {
@@ -422,6 +625,15 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the meronym tree linearly
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param depthScoreWeight
+ */
private void fathomMeronyms(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double depthScoreWeight) {
@@ -446,6 +658,16 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the meronym tree exponentially
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param intersectionExponent
+ * @param depthScoreExponent
+ */
private void fathomMeronymsExponential(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double intersectionExponent, double depthScoreExponent) {
@@ -470,6 +692,15 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the holonym tree linearly
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param depthScoreWeight
+ */
private void fathomHolonyms(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double depthScoreWeight) {
@@ -494,6 +725,16 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Recursively score the holonym tree exponentially
+ * @param wordSense
+ * @param child
+ * @param relvWords
+ * @param depth
+ * @param maxDepth
+ * @param intersectionExponent
+ * @param depthScoreExponent
+ */
private void fathomHolonymsExponential(WordSense wordSense, Synset child,
ArrayList<WordPOS> relvWords, int depth, int maxDepth,
double intersectionExponent, double depthScoreExponent) {
@@ -518,6 +759,12 @@ public class Lesk implements WSDisambigu
}
}
+ /**
+ * Checks if the feature should be counted in the score
+ * @param featureSynsets
+ * @param relevantWords
+ * @return count of features to consider
+ */
private int assessFeature(ArrayList<Synset> featureSynsets,
ArrayList<WordPOS> relevantWords) {
int count = 0;
@@ -540,25 +787,32 @@ public class Lesk implements WSDisambigu
return count;
}
+ /**
+ * Checks if the synonyms should be counted in the score
+ * @param synonyms
+ * @param relevantWords
+ * @return count of synonyms to consider
+ */
private int assessSynonyms(ArrayList<WordPOS> synonyms,
ArrayList<WordPOS> relevantWords) {
int count = 0;
for (WordPOS synonym : synonyms) {
for (WordPOS sentenceWord : relevantWords) {
- // TODO try to switch to lemmatizer
if (sentenceWord.isStemEquivalent(synonym)) {
count = count + 1;
}
}
-
}
-
return count;
}
+ /**
+ * Gets the senses of the nodes
+ * @param nodes
+ * @return senses from the nodes
+ */
public ArrayList<WordSense> updateSenses(ArrayList<Node> nodes) {
-
ArrayList<WordSense> scoredSenses = new ArrayList<WordSense>();
for (int i = 0; i < nodes.size(); i++) {
@@ -573,12 +827,25 @@ public class Lesk implements WSDisambigu
return scoredSenses;
}
-
- // disambiguates a WTDLesk and returns an array of sense indexes from WordNet
- // ordered by their score
+
+ /**
+ * Disambiguates an ambiguous word in its context
+ *
+ * @param tokenizedContext
+ * @param ambiguousTokenIndex
+ * @return array of sense indexes from WordNet ordered by their score.
+ * The result format is <b>POS</b>@<b>SenseID</b>@<b>Sense Score</b>.
+ * If the input token is not relevant, null is returned.
+ */
@Override
- public String[] disambiguate(String[] inputText, int inputWordIndex) {
- WTDLesk wtd = new WTDLesk(inputText, inputWordIndex);
+ public String[] disambiguate(String[] tokenizedContext, int
ambiguousTokenIndex) {
+
+ WTDLesk wtd = new WTDLesk(tokenizedContext, ambiguousTokenIndex);
+ // if the word is not relevant return null
+ if (!Constants.isRelevant(wtd.getPosTag())){
+ return null ;
+ }
+
ArrayList<WordSense> wsenses = null;
switch (this.params.leskType) {
@@ -654,15 +921,32 @@ public class Lesk implements WSDisambigu
LeskParameters.DFLT_DEXP, true, true, true, true, true);
Collections.sort(wsenses);
+ // TODO modify to longs but for now we have strings in the data for coarsing
String[] senses = new String[wsenses.size()];
for (int i = 0; i < wsenses.size(); i++) {
- senses[i] = wsenses.get(i).getSense();
+ senses[i] = Constants.getPOS(wsenses.get(i).getWTDLesk().getPosTag())
+ .getKey()
+ + "@"
+ + Long.toString(wsenses.get(i).getNode().getSenseID())
+ + "@"
+ + wsenses.get(i).getScore();
}
return senses;
}
- @Override
- public String[] disambiguate(String[] inputText, Span[] inputWordSpans) {
+
+ /**
+ * Disambiguates an ambiguous word in its context
+ * The user can set a span of inputWords from the tokenized input
+ *
+ * @param tokenizedContext
+ * @param ambiguousTokenSpans
+ * @return array of arrays of sense indexes from WordNet ordered by their score.
+ * The result format is <b>POS</b>@<b>SenseID</b>@<b>Sense Score</b>.
+ * If the input token is not relevant, null is returned.
+ */
+ @Override
+ public String[][] disambiguate(String[] tokenizedContext, Span[]
ambiguousTokenSpans) {
// TODO need to work on spans
return null;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
Thu Jul 2 16:08:55 2015
@@ -19,11 +19,23 @@
package opennlp.tools.disambiguator.lesk;
-public class LeskParameters {
+import opennlp.tools.disambiguator.WSDParameters;
- // VARIATIONS
+/**
+ * Lesk-specific parameter set.
+ *
+ */
+public class LeskParameters extends WSDParameters {
+
+
+ /**
+ * Enum of all types of implemented variations of Lesk
+ *
+ */
public static enum LESK_TYPE {
- LESK_BASIC, LESK_BASIC_CTXT, LESK_BASIC_CTXT_WIN, LESK_BASIC_CTXT_WIN_BF,
LESK_EXT, LESK_EXT_CTXT, LESK_EXT_CTXT_WIN, LESK_EXT_CTXT_WIN_BF, LESK_EXT_EXP,
LESK_EXT_EXP_CTXT, LESK_EXT_EXP_CTXT_WIN, LESK_EXT_EXP_CTXT_WIN_BF,
+ LESK_BASIC, LESK_BASIC_CTXT, LESK_BASIC_CTXT_WIN, LESK_BASIC_CTXT_WIN_BF,
+ LESK_EXT, LESK_EXT_CTXT, LESK_EXT_CTXT_WIN, LESK_EXT_CTXT_WIN_BF,
LESK_EXT_EXP,
+ LESK_EXT_EXP_CTXT, LESK_EXT_EXP_CTXT_WIN, LESK_EXT_EXP_CTXT_WIN_BF,
}
// DEFAULTS
@@ -33,25 +45,124 @@ public class LeskParameters {
protected static final double DFLT_IEXP = 0.3;
protected static final double DFLT_DEXP = 0.3;
- public LESK_TYPE leskType;
- public int win_f_size;
- public int win_b_size;
- public int depth;
-
- public boolean fathom_synonyms;
- public boolean fathom_hypernyms;
- public boolean fathom_hyponyms;
- public boolean fathom_meronyms;
- public boolean fathom_holonyms;
-
- public double depth_weight;
- public double iexp;
- public double dexp;
+ protected LESK_TYPE leskType;
+ protected int win_f_size;
+ protected int win_b_size;
+ protected int depth;
+
+ protected boolean fathom_synonyms;
+ protected boolean fathom_hypernyms;
+ protected boolean fathom_hyponyms;
+ protected boolean fathom_meronyms;
+ protected boolean fathom_holonyms;
+
+ protected double depth_weight;
+ protected double iexp;
+ protected double dexp;
+
+ public LESK_TYPE getLeskType() {
+ return leskType;
+ }
+
+ public void setLeskType(LESK_TYPE leskType) {
+ this.leskType = leskType;
+ }
+
+ public int getWin_f_size() {
+ return win_f_size;
+ }
+
+ public void setWin_f_size(int win_f_size) {
+ this.win_f_size = win_f_size;
+ }
+
+ public int getWin_b_size() {
+ return win_b_size;
+ }
+
+ public void setWin_b_size(int win_b_size) {
+ this.win_b_size = win_b_size;
+ }
+
+ public int getDepth() {
+ return depth;
+ }
+
+ public void setDepth(int depth) {
+ this.depth = depth;
+ }
+
+ public boolean isFathom_synonyms() {
+ return fathom_synonyms;
+ }
+
+ public void setFathom_synonyms(boolean fathom_synonyms) {
+ this.fathom_synonyms = fathom_synonyms;
+ }
+
+ public boolean isFathom_hypernyms() {
+ return fathom_hypernyms;
+ }
+
+ public void setFathom_hypernyms(boolean fathom_hypernyms) {
+ this.fathom_hypernyms = fathom_hypernyms;
+ }
+
+ public boolean isFathom_hyponyms() {
+ return fathom_hyponyms;
+ }
+
+ public void setFathom_hyponyms(boolean fathom_hyponyms) {
+ this.fathom_hyponyms = fathom_hyponyms;
+ }
+
+ public boolean isFathom_meronyms() {
+ return fathom_meronyms;
+ }
+
+ public void setFathom_meronyms(boolean fathom_meronyms) {
+ this.fathom_meronyms = fathom_meronyms;
+ }
+
+ public boolean isFathom_holonyms() {
+ return fathom_holonyms;
+ }
+
+ public void setFathom_holonyms(boolean fathom_holonyms) {
+ this.fathom_holonyms = fathom_holonyms;
+ }
+
+ public double getDepth_weight() {
+ return depth_weight;
+ }
+
+ public void setDepth_weight(double depth_weight) {
+ this.depth_weight = depth_weight;
+ }
+
+ public double getIexp() {
+ return iexp;
+ }
+
+ public void setIexp(double iexp) {
+ this.iexp = iexp;
+ }
+
+ public double getDexp() {
+ return dexp;
+ }
+
+ public void setDexp(double dexp) {
+ this.dexp = dexp;
+ }
public LeskParameters() {
this.setDefaults();
}
+ /**
+ * Sets default parameters
+ */
public void setDefaults() {
this.leskType = LeskParameters.DFLT_LESK_TYPE;
this.win_f_size = LeskParameters.DFLT_WIN_SIZE;
@@ -66,8 +177,10 @@ public class LeskParameters {
this.fathom_synonyms = true;
}
- // Parameter Validation
- // TODO make isSet for semantic feature booleans
+
+ /* (non-Javadoc)
+ * @see opennlp.tools.disambiguator.WSDParameters#isValid()
+ */
public boolean isValid() {
switch (this.leskType) {
@@ -81,16 +194,13 @@ public class LeskParameters {
case LESK_EXT:
case LESK_EXT_CTXT:
return (this.depth >= 0) && (this.depth_weight >= 0);
-
case LESK_EXT_CTXT_WIN:
case LESK_EXT_CTXT_WIN_BF:
return (this.depth >= 0) && (this.depth_weight >= 0)
&& (this.win_b_size >= 0) && (this.win_f_size >= 0);
-
case LESK_EXT_EXP:
case LESK_EXT_EXP_CTXT:
return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0);
-
case LESK_EXT_EXP_CTXT_WIN:
case LESK_EXT_EXP_CTXT_WIN_BF:
return (this.depth >= 0) && (this.dexp >= 0) && (this.iexp >= 0)
Modified:
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java?rev=1688852&r1=1688851&r2=1688852&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
Thu Jul 2 16:08:55 2015
@@ -45,40 +45,47 @@ public class Tester {
try {
TokenizerModel = new TokenizerModel(new FileInputStream(
-
"src\\test\\resources\\opennlp\\tools\\disambiguator\\en-token.bin"));
+ "src\\test\\resources\\models\\en-token.bin"));
Tokenizer tokenizer = new TokenizerME(TokenizerModel);
String[] words = tokenizer.tokenize(sentence);
-
- POSModel posTaggerModel = new POSModelLoader()
- .load(new File(
-
"src\\test\\resources\\opennlp\\tools\\disambiguator\\en-pos-maxent.bin"));
- POSTagger tagger = new POSTaggerME(posTaggerModel);
-
- Constants.print("\ntokens :");
+//
+// POSModel posTaggerModel = new POSModelLoader()
+// .load(new File(
+// "src\\test\\resources\\models\\en-pos-maxent.bin"));
+//// POSTagger tagger = new POSTaggerME(posTaggerModel);
+//
+// Constants.print("\ntokens :");
Constants.print(words);
- Constants.print(tagger.tag(words));
+
+ int wordIndex= 6;
+// Constants.print(tagger.tag(words));
Constants.print("\ntesting default lesk :");
Lesk lesk = new Lesk();
- Constants.print(lesk.disambiguate(words, 6));
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
Constants.print("\ntesting with null params :");
lesk.setParams(null);
- Constants.print(lesk.disambiguate(words, 6));
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
Constants.print("\ntesting with default params");
lesk.setParams(new LeskParameters());
- Constants.print(lesk.disambiguate(words, 6));
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
Constants.print("\ntesting with custom params :");
LeskParameters leskParams = new LeskParameters();
- leskParams.leskType = LeskParameters.LESK_TYPE.LESK_BASIC_CTXT_WIN_BF;
- leskParams.win_b_size = 4;
- leskParams.depth = 3;
+ leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_BASIC_CTXT_WIN_BF);
+ leskParams.setWin_b_size(4);
+ leskParams.setDepth(3);
lesk.setParams(leskParams);
- Constants.print(lesk.disambiguate(words, 6));
-
+ Constants.print(lesk.disambiguate(words, wordIndex));
+ Constants.printResults(lesk,lesk.disambiguate(words, wordIndex));
+
/*
* Constants.print("\ntesting with wrong params should throw exception
:");
* LeskParameters leskWrongParams = new LeskParameters();