Author: joern
Date: Mon Aug  3 08:11:04 2015
New Revision: 1693857

URL: http://svn.apache.org/r1693857
Log:
OPENNLP-802 The WSDisambiguator needs a baseline to compare the implemented
approaches with.
Lesk presents a good baseline; however, the Senseval and SemEval workshops
demonstrated that MFS (Most Frequent Sense) presents a better and more
challenging baseline.

Thanks to Mondher Bouazizi for providing a patch!

Added:
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
   (with props)
Modified:
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java?rev=1693857&r1=1693856&r2=1693857&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
 Mon Aug  3 08:11:04 2015
@@ -25,6 +25,8 @@ import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Collections;
+import java.util.Arrays;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -209,6 +211,8 @@ public class SensevalReader {
                 ArrayList<String> answers = new ArrayList<String>();
                 String sentence = "";
                 String rawWord = "";
+                String[] finalText = null;
+                int index = 0;
 
                 NodeList nChildren = nInstance.getChildNodes();
 
@@ -230,18 +234,46 @@ public class SensevalReader {
                     sentence = ((Element) nChild).getTextContent();
 
                     if (nChild.hasChildNodes()) {
-                      // textbefore =
-                      // nChild.getChildNodes().item(0).getTextContent();
+                      String textBefore = nChild.getChildNodes().item(0)
+                          .getTextContent();
                       rawWord = 
nChild.getChildNodes().item(1).getTextContent();
-                      // textAfter =
-                      // nChild.getChildNodes().item(2).getTextContent();
+                      String textAfter = nChild.getChildNodes().item(2)
+                          .getTextContent();
+
+                      ArrayList<String> textBeforeTokenzed = new 
ArrayList<String>(
+                          Arrays.asList(textBefore.split("\\s")));
+                      ArrayList<String> textAfterTokenzed = new 
ArrayList<String>(
+                          Arrays.asList(textAfter.split("\\s")));
+
+                      
textBeforeTokenzed.removeAll(Collections.singleton(null));
+                      textBeforeTokenzed.removeAll(Collections.singleton(""));
+
+                      textAfterTokenzed.removeAll(Collections.singleton(null));
+                      textAfterTokenzed.removeAll(Collections.singleton(""));
+
+                      finalText = new String[textBeforeTokenzed.size() + 1
+                          + textAfterTokenzed.size()];
+
+                      int l = 0;
+                      for (String tempWord : textBeforeTokenzed) {
+                        finalText[l] = tempWord;
+                        l++;
+                      }
+                      index = l;
+                      finalText[l] = rawWord.toLowerCase();
+                      l++;
+                      for (String tempWord : textAfterTokenzed) {
+                        finalText[l] = tempWord;
+                        l++;
+                      }
+
                     }
                   }
 
                 }
 
-                WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
-                    rawWord);
+                WTDIMS wordToDisambiguate = new WTDIMS(finalText, index,
+                    answers);
                 setInstances.add(wordToDisambiguate);
               }
             }

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java?rev=1693857&r1=1693856&r2=1693857&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
 Mon Aug  3 08:11:04 2015
@@ -67,6 +67,11 @@ public class WTDIMS extends WordToDisamb
     super(wtd.getSentence(), wtd.getWordIndex(), wtd.getSense());
     this.senseIDs = wtd.getSenseIDs();
   }
+  
+  /**
+   * Creates an instance from a tokenized sentence, the index of the target
+   * word and its sense IDs.
+   * 
+   * @param sentence
+   *          the tokens of the sentence; passed to the superclass constructor
+   * @param wordIndex
+   *          the index of the target word in {@code sentence}; passed to the
+   *          superclass constructor
+   * @param senseIDs
+   *          the sense IDs; the list is stored by reference, no copy is made
+   */
+  public WTDIMS(String[] sentence, int wordIndex, ArrayList<String> senseIDs) {
+    super(sentence, wordIndex);
+    this.senseIDs = senseIDs;
+  }
 
   public String[] getPosOfSurroundingWords() {
     return posOfSurroundingWords;

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java?rev=1693857&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
 Mon Aug  3 08:11:04 2015
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.mfs;
+
+import java.security.InvalidParameterException;
+import java.util.ArrayList;
+
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.data.Synset;
+import net.sf.extjwnl.data.Word;
+import opennlp.tools.disambiguator.Constants;
+import opennlp.tools.disambiguator.WSDParameters;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.WordPOS;
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.util.Span;
+
+/**
+ * Implementation of the <b>Most Frequent Sense</b> (MFS) baseline approach.
+ * <p>
+ * The senses of the target word are retrieved from WordNet and returned in the
+ * order in which they are retrieved; the first sense retrieved is supposed to
+ * be the most frequent one. Only the target word and its part of speech are
+ * looked at.
+ */
+public class MFS implements WSDisambiguator {
+
+  /** Parameters of this disambiguator (see {@link #getParams()}). */
+  public WSDParameters parameters;
+
+  /**
+   * Creates an MFS disambiguator that uses the given parameters.
+   * 
+   * @param parameters
+   *          the parameters to use
+   */
+  public MFS(WSDParameters parameters) {
+    this.parameters = parameters;
+  }
+
+  /** Creates an MFS disambiguator with default {@link MFSParameters}. */
+  public MFS() {
+    this.parameters = new MFSParameters();
+  }
+
+  /**
+   * Looks up the WordNet senses of the given target word.
+   * 
+   * @param wordToDisambiguate
+   *          the target word; its raw word and PoS tag are used
+   * @return one entry per synset found for the lower-cased raw word, each of
+   *         the form {@code "WordNet " + senseKey}; an entry is {@code null}
+   *         when no word of the synset has the expected lemma. Returns
+   *         {@code null} when {@code Constants.getPOS} yields no {@link POS}
+   *         for the PoS tag.
+   */
+  private String[] getMostFrequentSense(
+      WordToDisambiguate wordToDisambiguate) {
+
+    String word = wordToDisambiguate.getRawWord().toLowerCase();
+    POS pos = Constants.getPOS(wordToDisambiguate.getPosTag());
+
+    if (pos == null) {
+      System.out.println("The word has no definitions in WordNet !");
+      return null;
+    }
+
+    ArrayList<Synset> synsets = new WordPOS(word, pos).getSynsets();
+    String[] senses = new String[synsets.size()];
+
+    for (int i = 0; i < senses.length; i++) {
+      for (Word wd : synsets.get(i).getWords()) {
+        // NOTE(review): the lookup above uses the lower-cased word while this
+        // comparison uses the raw word as given; confirm this is intended.
+        if (wd.getLemma().equals(
+            wordToDisambiguate.getRawWord().split("\\.")[0])) {
+          String senseKey = null;
+          try {
+            senseKey = wd.getSenseKey();
+          } catch (JWNLException e) {
+            e.printStackTrace();
+          }
+          senses[i] = "WordNet " + senseKey;
+          break;
+        }
+      }
+    }
+    return senses;
+  }
+
+  /**
+   * Returns the WordNet sense keys of a word given as a word tag. It serves
+   * as a quick check of the most frequent sense without any need to create a
+   * {@link WordToDisambiguate} instance.
+   * 
+   * @param wordTag
+   *          the word to disambiguate. It should be written in the format
+   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v"), where
+   *          the tag is one of a (adjective), r (adverb), n (noun) or
+   *          v (verb), case-insensitive
+   * @return the sense keys of the word, one entry per synset in the order in
+   *         which the synsets are retrieved (the first one is supposed to be
+   *         the most frequent sense); an entry is {@code null} when no word
+   *         of the synset has exactly this lemma or its sense key could not
+   *         be read. Returns {@code null} when the tag is not recognized.
+   * @throws ArrayIndexOutOfBoundsException
+   *           if {@code wordTag} does not contain a "." followed by a tag
+   */
+  public String[] getMostFrequentSense(String wordTag) {
+
+    String[] parts = wordTag.split("\\.");
+    String word = parts[0];
+    String tag = parts[1];
+
+    POS pos;
+
+    if (tag.equalsIgnoreCase("a")) {
+      pos = POS.ADJECTIVE;
+    } else if (tag.equalsIgnoreCase("r")) {
+      pos = POS.ADVERB;
+    } else if (tag.equalsIgnoreCase("n")) {
+      pos = POS.NOUN;
+    } else if (tag.equalsIgnoreCase("v")) {
+      // This branch used to test "a" a second time, so verbs were never
+      // recognized.
+      pos = POS.VERB;
+    } else {
+      pos = null;
+    }
+
+    if (pos == null) {
+      System.out.println("The word has no definitions in WordNet !");
+      return null;
+    }
+
+    ArrayList<Synset> synsets = new WordPOS(word, pos).getSynsets();
+    String[] senses = new String[synsets.size()];
+
+    for (int i = 0; i < senses.length; i++) {
+      for (Word wd : synsets.get(i).getWords()) {
+        if (wd.getLemma().equals(word)) {
+          String senseKey = null;
+          try {
+            senseKey = wd.getSenseKey();
+          } catch (JWNLException e) {
+            e.printStackTrace();
+          }
+          senses[i] = senseKey;
+          break;
+        }
+      }
+    }
+    return senses;
+  }
+
+  /** Returns the parameters currently used by this disambiguator. */
+  @Override
+  public WSDParameters getParams() {
+    return this.parameters;
+  }
+
+  /**
+   * Replaces the parameters of this disambiguator. No validation is done
+   * here; the given object is stored as is.
+   */
+  @Override
+  public void setParams(WSDParameters params)
+      throws InvalidParameterException {
+    this.parameters = params;
+  }
+
+  /**
+   * Returns the WordNet senses of the token at the given index.
+   * 
+   * @param tokenizedContext
+   *          the tokens of the context
+   * @param ambiguousTokenIndex
+   *          the index of the token to disambiguate
+   * @return the senses, each prefixed with {@code "WordNet "}, or
+   *         {@code null} when no WordNet part of speech is found for the
+   *         token
+   */
+  @Override
+  public String[] disambiguate(String[] tokenizedContext,
+      int ambiguousTokenIndex) {
+    WordToDisambiguate wtd = new WordToDisambiguate(tokenizedContext,
+        ambiguousTokenIndex);
+    return getMostFrequentSense(wtd);
+  }
+
+  /**
+   * Not implemented yet; always returns {@code null}.
+   */
+  @Override
+  public String[][] disambiguate(String[] tokenizedContext,
+      Span[] ambiguousTokenIndexSpans) {
+
+    // TODO Auto-generated method stub
+    return null;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java?rev=1693857&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
 Mon Aug  3 08:11:04 2015
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.mfs;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+/**
+ * Parameters of the MFS disambiguator: the coarse-sense flag and the source
+ * the senses are taken from.
+ */
+public class MFSParameters extends WSDParameters {
+
+  /** Sources the senses can be taken from. */
+  public static enum Source {
+    WORDNET(1, "wordnet");
+
+    // NOTE(review): these fields are public and mutable; consider making
+    // them final once it is confirmed that nothing assigns to them.
+    public int code;
+    public String src;
+
+    private Source(int code, String src) {
+      this.code = code;
+      this.src = src;
+    }
+  }
+
+  /** The selected source; {@link Source#WORDNET} unless changed. */
+  protected Source source;
+
+  /**
+   * Creates the default parameters: coarse senses disabled and
+   * {@link Source#WORDNET} as source.
+   */
+  public MFSParameters() {
+    this.isCoarseSense = false;
+    this.source = Source.WORDNET;
+  }
+
+  /** Returns the selected source. */
+  public Source getSource() {
+    return source;
+  }
+
+  /**
+   * Selects the source.
+   * 
+   * @param source
+   *          the source to use
+   */
+  public void setSource(Source source) {
+    this.source = source;
+  }
+
+  /**
+   * Checks these parameters.
+   * 
+   * @return {@code true} if a source is set and its code is 1 (the code of
+   *         {@link Source#WORDNET}), {@code false} otherwise, including when
+   *         the source is {@code null}
+   */
+  @Override
+  public boolean isValid() {
+    // The null check avoids a NullPointerException after setSource(null).
+    return this.source != null && this.source.code == 1;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java?rev=1693857&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
 Mon Aug  3 08:11:04 2015
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+import opennlp.tools.disambiguator.mfs.MFS;
+import opennlp.tools.disambiguator.mfs.MFSParameters;
+
+import org.junit.Test;
+
+/**
+ * Evaluates the {@link MFS} disambiguator on the data provided by
+ * {@link SensevalReader}, word by word, and prints the results.
+ */
+public class MFSEvaluatorTest {
+
+  static SensevalReader seReader = new SensevalReader();
+
+  /**
+   * JUnit entry point. JUnit test methods have to be public, non-static and
+   * without parameters, so the annotation cannot be put on
+   * {@link #main(String[])} itself; this method simply delegates to it.
+   */
+  @Test
+  public void testEvaluation() {
+    main(new String[0]);
+  }
+
+  /**
+   * Runs the evaluation for every word returned by
+   * {@code SensevalReader.getSensevalWords()} except verbs and prints one
+   * evaluation summary per word.
+   * 
+   * @param args
+   *          not used
+   */
+  public static void main(String[] args) {
+    Constants.print("Evaluation Started");
+
+    MFS mfs = new MFS();
+    MFSParameters mfsParams = new MFSParameters();
+    mfs.setParams(mfsParams);
+
+    ArrayList<String> words = seReader.getSensevalWords();
+
+    for (String word : words) {
+      WSDEvaluator evaluator = new WSDEvaluator(mfs);
+
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+
+        ArrayList<WTDIMS> instances = getTestData(word);
+
+        if (instances != null) {
+          Constants.print("------------------" + word + "------------------");
+          for (WordToDisambiguate instance : instances) {
+
+            // Only evaluate instances that carry a usable sense ID.
+            if (instance.getSenseIDs() != null
+                && !instance.getSenseIDs().get(0).equals("null")) {
+              evaluator.evaluateSample(instance);
+            }
+          }
+          Constants.print(evaluator.toString());
+        } else {
+          Constants.print("null instances");
+        }
+      }
+
+    }
+
+  }
+
+  /**
+   * For a specific word, returns the corresponding Senseval instances in the
+   * form of {@link WTDIMS}. Instances whose first sense ID is {@code null} or
+   * "U" (case-insensitive) are left out.
+   * 
+   * @param wordTag
+   *          the word of which the instances are to be collected. wordTag has
+   *          to be in the format "word.POS" (e.g., "activate.v", "smart.a",
+   *          etc.)
+   * @return list of {@link WTDIMS} instances of the wordTag; never
+   *         {@code null}
+   */
+  protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+
+    ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+    for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+      // A freshly constructed object is never null, so no null check is
+      // needed here.
+      WTDIMS wtdims = new WTDIMS(wtd);
+      if (wtdims.getSenseIDs().get(0) != null
+          && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {
+        instances.add(wtdims);
+      }
+    }
+
+    return instances;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java?rev=1693857&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
 Mon Aug  3 08:11:04 2015
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import opennlp.tools.disambiguator.mfs.MFS;
+
+/**
+ * Small demo showing how to invoke the disambiguation function of the
+ * {@link MFS} class on a few tokenized sentences.
+ */
+public class MFSTester {
+
+  public static void main(String[] args) {
+
+    final MFS disambiguator = new MFS();
+
+    // Each sentence is paired with the index of the token to disambiguate.
+    final String[] texts = {
+        "Please write to me soon.",
+        "it was a strong argument that his hypothesis was true",
+        "the component was highly radioactive to the point that it "
+            + "has been activated the second it touched water" };
+    final int[] targets = { 1, 3, 12 };
+
+    for (int i = 0; i < texts.length; i++) {
+      String[] tokens = Loader.getTokenizer().tokenize(texts[i]);
+      Constants.print(disambiguator.disambiguate(tokens, targets[i]));
+    }
+
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain


Reply via email to