disambigua...

joern Tue, 04 Aug 2015 00:49:30 -0700

Author: joern
Date: Tue Aug  4 07:48:48 2015
New Revision: 1694008

URL: http://svn.apache.org/r1694008
Log:
OPENNLP-758 Updated Lesk with new data readers and added MFS in case no 
overlaps are found (similar to the simplified version). Thanks to Anthony 
Beylerian for providing a patch.


Added:
    opennlp/sandbox/opennlp-wsd/.project   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
   (with props)
Removed:
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
Modified:
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java

Added: opennlp/sandbox/opennlp-wsd/.project
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/.project?rev=1694008&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/.project (added)
+++ opennlp/sandbox/opennlp-wsd/.project Tue Aug  4 07:48:48 2015
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+       <name>opennlp-wsd</name>
+       <comment></comment>
+       <projects>
+       </projects>
+       <buildSpec>
+       </buildSpec>
+       <natures>
+       </natures>
+</projectDescription>

Propchange: opennlp/sandbox/opennlp-wsd/.project
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
 Tue Aug  4 07:48:48 2015
@@ -138,33 +138,48 @@ public class Constants {
           "you're", "yours", "yourself", "yourselves", "you've", "zero"));
 
   // Print a text in the console
-  public static void printResults(WSDisambiguator disambiguator,
-      String[] results) {
+// Print the gloss (preceded by the score for Lesk) of each disambiguation result
+ public static void printResults(WSDisambiguator disambiguator,
+     String[] results) {
+
+   if (results != null) {
+
+     String[] parts;
+     String sensekey;
+     if (disambiguator instanceof Lesk) {
+
+       Double score;
+
+       for (String result : results) {
+         parts = result.split(" ");
+         sensekey = parts[1];
+         score = Double.parseDouble(parts[2]);
+         try {
+           Constants.print("score : "
+               + score
+               + " for : "
+               + Loader.getDictionary().getWordBySenseKey(sensekey)
+                   .getSynset().getGloss());
+         } catch (JWNLException e) {
+           e.printStackTrace();
+         }
+       }
+     } else {
+       for (String result : results) {
+         parts = result.split(" ");
+         sensekey = parts[1];
+         try {
+           Constants.print("sense : "
+               + Loader.getDictionary().getWordBySenseKey(sensekey)
+                   .getSynset().getGloss());
+         } catch (JWNLException e) {
+           e.printStackTrace();
+         }
+       }
+     }
+   }
 
-    if (results != null) {
-
-      if (disambiguator instanceof Lesk) {
-        POS pos;
-        long offset;
-        double score;
-        String[] parts;
-
-        for (String result : results) {
-          parts = result.split("@");
-          pos = POS.getPOSForKey(parts[0]);
-          offset = Long.parseLong(parts[1]);
-          score = Double.parseDouble(parts[3]);
-          try {
-            Constants.print("score : " + score + " for : "
-                + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
-          } catch (JWNLException e) {
-            e.printStackTrace();
-          }
-        }
-      }
-    }
-
-  }
+ }
 
   public static void print(Object in) {
     if (in == null) {

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
 Tue Aug  4 07:48:48 2015
@@ -30,7 +30,7 @@ import net.sf.extjwnl.data.POS;
 import net.sf.extjwnl.dictionary.Dictionary;
 import net.sf.extjwnl.dictionary.MorphologicalProcessor;
 import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
 import opennlp.tools.lemmatizer.SimpleLemmatizer;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java?rev=1694008&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
 Tue Aug  4 07:48:48 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.util.ArrayList;
+
+/** A numbered paragraph: an ordered list of {@link Sentence}s. */
+public class Paragraph {
+
+  protected int pnum;
+  protected ArrayList<Sentence> isentences;
+
+  public Paragraph() {
+    this.isentences = new ArrayList<Sentence>();
+  }
+
+  public Paragraph(int pnum) {
+    this.pnum = pnum;
+    this.isentences = new ArrayList<Sentence>();
+  }
+
+  public Paragraph(int pnum, ArrayList<Sentence> sentences) {
+    this.pnum = pnum;
+    this.isentences = sentences;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public ArrayList<Sentence> getSsentences() {
+    return isentences;
+  }
+
+  public void setIsentences(ArrayList<Sentence> isentences) {
+    this.isentences = isentences;
+  }
+
+  public void addIsentence(Sentence isentence) {
+    this.isentences.add(isentence);
+  }
+
+  /** Joins the sentences with single spaces; empty if there are none. */
+  @Override
+  public String toString() {
+    // A builder replaces the former "prefix a space, then substring(1)"
+    // approach, which threw on a paragraph without sentences.
+    StringBuilder paragraph = new StringBuilder();
+    for (int i = 0; i < this.isentences.size(); i++) {
+      if (i > 0) {
+        paragraph.append(' ');
+      }
+      paragraph.append(this.isentences.get(i).toString());
+    }
+    return paragraph.toString();
+  }
+
+  /**
+   * Tells whether a word of this paragraph is an instance of the given tag.
+   *
+   * @param wordTag the word tag to look for
+   * @return {@code true} if at least one word matches {@code wordTag}
+   */
+  public boolean contains(String wordTag) {
+    for (Sentence isentence : this.getSsentences()) {
+      for (Word iword : isentence.getIwords()) {
+        // the previous check compared iword with itself: always true
+        if (iword.isInstanceOf(wordTag)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java?rev=1694008&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
 Tue Aug  4 07:48:48 2015
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Reader for the SemCor 3.0 corpus: parses the XML tag files and extracts
+ * the sense-tagged occurrences of a word as {@link WordToDisambiguate}.
+ * The corpus location is relative to the working directory.
+ */
+public class SemcorReaderExtended {
+
+  private static final String ELEMENT_CONTEXTFILE = "contextfile";
+  private static final String ATTRIBUTE_CONCORDANCE = "concordance";
+
+  private static final String ELEMENT_CONTEXT = "context";
+  private static final String ATTRIBUTE_FILENAME = "filename";
+  private static final String ATTRIBUTE_PARAS = "paras";
+
+  private static final String ELEMENT_PARAGRAPH = "p";
+  private static final String ATTRIBUTE_PARAGRAPHNUM = "pnum";
+
+  private static final String ELEMENT_SENTENCE = "s";
+  private static final String ATTRIBUTE_SENTENCENUM = "snum";
+
+  private static final String ELEMENT_WORDFORM = "wf";
+  private static final String ATTRIBUTE_CMD = "cmd";
+  private static final String ATTRIBUTE_RDF = "rdf";
+  private static final String ATTRIBUTE_POS = "pos";
+  private static final String ATTRIBUTE_LEMMA = "lemma";
+  private static final String ATTRIBUTE_WNSN = "wnsn";
+  private static final String ATTRIBUTE_LEXSN = "lexsn";
+
+  private static final String ELEMENT_PUNCTUATION = "punc";
+
+  // Built with File.separator so the corpus is also found outside Windows.
+  private static final String SEP = File.separator;
+  private static String path = "src" + SEP + "test" + SEP + "resources" + SEP
+      + "semcor3.0" + SEP;
+  private static String[] folders = { "brown1", "brown2", "brownv" };
+  private static String tagfiles = SEP + "tagfiles" + SEP;
+
+  public SemcorReaderExtended() {
+  }
+
+  /**
+   * Reads one SemCor XML file.
+   *
+   * @param file
+   *          the path of the file to parse
+   * @return the sentences read, in document order; a parsing problem is
+   *         printed and ends the reading, keeping what was read so far
+   */
+  public ArrayList<Sentence> readFile(String file) {
+
+    ArrayList<Sentence> result = new ArrayList<Sentence>();
+
+    try {
+      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+      Document doc = dBuilder.parse(new File(file));
+
+      doc.getDocumentElement().normalize();
+
+      NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);
+      for (int i = 0; i < paragraphs.getLength(); i++) {
+        Node nParagraph = paragraphs.item(i);
+        if (nParagraph.getNodeType() != Node.ELEMENT_NODE) {
+          continue;
+        }
+
+        int paragraphID = Integer.parseInt(((Element) nParagraph)
+            .getAttribute(ATTRIBUTE_PARAGRAPHNUM));
+
+        // Every child is inspected (the loop used to start at 1 and could
+        // skip a sentence placed first); non-element nodes are filtered.
+        NodeList nSentences = nParagraph.getChildNodes();
+        for (int j = 0; j < nSentences.getLength(); j++) {
+          Node nSentence = nSentences.item(j);
+          if (nSentence.getNodeType() == Node.ELEMENT_NODE) {
+            result.add(readSentence((Element) nSentence, paragraphID));
+          }
+        }
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return result;
+  }
+
+  /**
+   * Builds a {@link Sentence} from a sentence element: word forms and
+   * punctuation marks become {@link Word}s numbered from 0, other children
+   * are ignored.
+   */
+  private Sentence readSentence(Element eSentence, int paragraphID) {
+
+    int sentenceID = Integer.parseInt(eSentence
+        .getAttribute(ATTRIBUTE_SENTENCENUM));
+    Sentence isentence = new Sentence(paragraphID, sentenceID);
+
+    NodeList nWords = eSentence.getChildNodes();
+    int wnum = 0;
+
+    for (int k = 0; k < nWords.getLength(); k++) {
+      Node nWord = nWords.item(k);
+      if (nWord.getNodeType() != Node.ELEMENT_NODE) {
+        continue;
+      }
+
+      Element eWord = (Element) nWord;
+      String word = eWord.getTextContent();
+      Word iword;
+
+      if (nWord.getNodeName().equals(ELEMENT_WORDFORM)) {
+
+        String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
+        String pos = eWord.getAttribute(ATTRIBUTE_POS);
+
+        if (cmd.equals("done")) {
+          // the word is already disambiguated: keep its sense information
+          String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
+          String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
+          String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
+          iword = new Word(paragraphID, sentenceID, wnum, Word.Type.WORD,
+              word, cmd, pos, lemma, wnsn, lexsn);
+        } else {
+          // the word is not disambiguated
+          iword = new Word(paragraphID, sentenceID, wnum, Word.Type.WORD,
+              word, cmd, pos);
+        }
+
+      } else if (nWord.getNodeName().equals(ELEMENT_PUNCTUATION)) {
+        iword = new Word(paragraphID, sentenceID, wnum,
+            Word.Type.PUNCTUATIONMARK, word);
+      } else {
+        continue;
+      }
+
+      isentence.addIword(iword);
+      wnum++;
+    }
+
+    return isentence;
+  }
+
+  /**
+   * Extracts, from one SemCor file, the instances of a word together with a
+   * context of up to three sentences.
+   *
+   * @param file
+   *          the path of the SemCor file
+   * @param wordTag
+   *          the word, of which we are looking for the instances
+   * @return the instances that carry a sense ({@code lexsn}) annotation
+   */
+  public ArrayList<WordToDisambiguate> getSemcorOneFileData(String file,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> setInstances =
+        new ArrayList<WordToDisambiguate>();
+
+    try {
+      ArrayList<Sentence> isentences = readFile(file);
+      int last = isentences.size() - 1;
+
+      for (int j = 0; j <= last; j++) {
+        ArrayList<Word> iwords = isentences.get(j).getIwords();
+        for (int k = 0; k < iwords.size(); k++) {
+          Word iword = iwords.get(k);
+          if (!iword.isInstanceOf(wordTag)) {
+            continue;
+          }
+
+          // Context window: the first sentence takes the next two, the last
+          // one the previous two, any other its two neighbours. The bounds
+          // are clamped so that files with fewer than three sentences no
+          // longer raise an IndexOutOfBoundsException.
+          int from = j - 1;
+          if (j == 0) {
+            from = 0;
+          } else if (j == last) {
+            from = j - 2;
+          }
+          int to = Math.min(from + 2, last);
+          from = Math.max(from, 0);
+
+          StringBuilder sentence = new StringBuilder();
+          int index = k;
+          for (int s = from; s <= to; s++) {
+            if (s > from) {
+              sentence.append(' ');
+            }
+            sentence.append(isentences.get(s).toString());
+            if (s < j) {
+              // words of the preceding sentences shift the target index
+              index += isentences.get(s).getIwords().size();
+            }
+          }
+
+          String sense = iword.getLexsn();
+          if (sense != null) {
+            ArrayList<String> senses = new ArrayList<String>();
+            senses.add(sense);
+            setInstances.add(new WordToDisambiguate(sentence.toString()
+                .split("\\s"), index, senses));
+          }
+        }
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return setInstances;
+  }
+
+  /**
+   * One Semcor folder reader: This reads all the files in one semcor folder,
+   * and return all the instances in the format {@link WordToDisambiguate} of a
+   * specific word
+   *
+   * @param folder
+   *          the name of the folder. Three folders exist in Semcor3.0, which
+   *          are ["brown1", "brown2", "brownv"]
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances
+   */
+  public ArrayList<WordToDisambiguate> getSemcorFolderData(String folder,
+      String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    File tempFolder = new File(path + folder + tagfiles);
+    // listFiles() returns null when the path is not a readable directory
+    File[] listOfFiles = tempFolder.listFiles();
+
+    if (listOfFiles != null) {
+      for (File file : listOfFiles) {
+        if (file.isFile()) {
+          result.addAll(getSemcorOneFileData(file.getPath(), wordTag));
+        }
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * Semcor reader: This reads all the files in semcor, and return all the
+   * instances in the format {@link WordToDisambiguate} of a specific word
+   *
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances of the word
+   *         to disambiguate
+   */
+  public ArrayList<WordToDisambiguate> getSemcorData(String wordTag) {
+
+    ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+    for (String folder : folders) {
+      result.addAll(getSemcorFolderData(folder, wordTag));
+    }
+
+    return result;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java?rev=1694008&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
 Tue Aug  4 07:48:48 2015
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Collections;
+import java.util.Arrays;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+/**
+ * This class handles the extraction of Senseval-3 data from the different
+ * files (training data, dictionary instances, etc.)
+ * <p>
+ * By default the files are looked up in the senseval3 folder of the test
+ * resources, relative to the working directory.
+ */
+public class SensevalReader {
+
+  // File.separator keeps the default locations valid outside Windows too.
+  private String resourcesFolder = "src" + File.separator + "test"
+      + File.separator + "resources" + File.separator;
+  protected String sensevalDirectory = resourcesFolder + "senseval3"
+      + File.separator;
+
+  protected String data = sensevalDirectory + "EnglishLS.train";
+  protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+  protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+
+  // protected String dict = sensevalDirectory + "EnglishLS.dictionary.xml";
+  // protected String map = sensevalDirectory + "EnglishLS.sensemap";
+
+  /**
+   * The XML file of Senseval presents some issues that need to be fixed first
+   *
+   * @return always {@code null} for now, the fix is not implemented yet
+   */
+  private String fixXmlFile() {
+
+    // TODO fix this !
+
+    return null;
+  }
+
+  /** Creates a reader that uses the default Senseval-3 file locations. */
+  public SensevalReader() {
+    super();
+  }
+
+  /**
+   * This extracts the equivalent senses. This serves in the case of the
+   * coarse-grained disambiguation. The senses are read from
+   * {@link #sensemapFile}, which holds one set of equivalent senses per line.
+   *
+   * @return a {@link HashMap} containing the new sense ID ({@link Integer},
+   *         the 0-based line number) and an {@link ArrayList} of the
+   *         equivalent senses original IDs; entries read before an I/O error
+   *         are kept
+   */
+  public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
+
+    HashMap<Integer, ArrayList<String>> mappedSenses =
+        new HashMap<Integer, ArrayList<String>>();
+
+    try (BufferedReader wordsList = new BufferedReader(new FileReader(
+        sensemapFile))) {
+
+      int index = 0;
+
+      String line;
+
+      while ((line = wordsList.readLine()) != null) {
+
+        ArrayList<String> tempSenses = new ArrayList<String>();
+
+        for (String sense : line.split("\\s")) {
+          // empty and one-character tokens are skipped
+          if (sense.length() > 1) {
+            tempSenses.add(sense);
+          }
+        }
+
+        mappedSenses.put(index, tempSenses);
+        index++;
+
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    return mappedSenses;
+
+  }
+
+  /**
+   * This returns the list of words available in the Senseval data
+   *
+   * @return {@link ArrayList} of the words available on the current Senseval
+   *         set, without duplicates, in order of first appearance
+   */
+  public ArrayList<String> getSensevalWords() {
+
+    ArrayList<String> wordTags = new ArrayList<String>();
+
+    try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {
+
+      String line;
+
+      while ((line = br.readLine()) != null) {
+
+        // a blank line has no first field: split() would yield an empty
+        // array (or an empty word), so it is skipped
+        if (line.trim().isEmpty()) {
+          continue;
+        }
+
+        // the word is the first whitespace-separated field of the line
+        String word = line.split("\\s")[0];
+
+        if (!wordTags.contains(word)) {
+          wordTags.add(word);
+        }
+
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    return wordTags;
+
+  }
+
+  /**
+   * Main Senseval Reader: This checks if the data corresponding to the words
+   * to disambiguate exist in the folder, and extract the
+   * {@link WordToDisambiguate} instances
+   *
+   * @param wordTag
+   *          The word, of which we are looking for the instances
+   * @return the list of the {@link WordToDisambiguate} instances of the word
+   *         to disambiguate; instances without a usable context are left
+   *         out, and a parsing problem is printed and ends the extraction
+   */
+  public ArrayList<WordToDisambiguate> getSensevalData(String wordTag) {
+
+    ArrayList<WordToDisambiguate> setInstances =
+        new ArrayList<WordToDisambiguate>();
+
+    try {
+
+      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+      Document doc = dBuilder.parse(new File(data));
+
+      doc.getDocumentElement().normalize();
+
+      NodeList lexelts = doc.getElementsByTagName("lexelt");
+
+      for (int i = 0; i < lexelts.getLength(); i++) {
+
+        Node nLexelt = lexelts.item(i);
+
+        // only the lexelt of the requested word is of interest
+        if (nLexelt.getNodeType() != Node.ELEMENT_NODE
+            || !((Element) nLexelt).getAttribute("item").equals(wordTag)) {
+          continue;
+        }
+
+        // start at 0: non-element children are filtered below anyway
+        NodeList nInstances = nLexelt.getChildNodes();
+
+        for (int j = 0; j < nInstances.getLength(); j++) {
+
+          Node nInstance = nInstances.item(j);
+
+          if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
+            WTDIMS instance = readInstance(nInstance);
+            if (instance != null) {
+              setInstances.add(instance);
+            }
+          }
+        }
+
+      }
+
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    return setInstances;
+
+  }
+
+  /**
+   * Builds the instance described by one child element of a lexelt.
+   *
+   * @param nInstance
+   *          the instance node; its "answer" children give the sense IDs and
+   *          its "context" child the text around the target word
+   * @return the instance, or {@code null} if no usable context was found
+   */
+  private WTDIMS readInstance(Node nInstance) {
+
+    ArrayList<String> answers = new ArrayList<String>();
+    String[] finalText = null;
+    int index = 0;
+
+    NodeList nChildren = nInstance.getChildNodes();
+
+    for (int k = 0; k < nChildren.getLength(); k++) {
+
+      Node nChild = nChildren.item(k);
+
+      if (nChild.getNodeName().equals("answer")) {
+        // read the attribute by name: the order of a NamedNodeMap is not
+        // specified, so item(1) was not guaranteed to be the sense ID
+        answers.add(((Element) nChild).getAttribute("senseid"));
+      }
+
+      if (nChild.getNodeName().equals("context")
+          && nChild.getChildNodes().getLength() >= 3) {
+
+        // expected layout: text before, the target word, text after
+        NodeList parts = nChild.getChildNodes();
+        ArrayList<String> tokens = tokenize(parts.item(0).getTextContent());
+        String rawWord = parts.item(1).getTextContent();
+        ArrayList<String> textAfter = tokenize(parts.item(2).getTextContent());
+
+        // the target word (lower-cased) follows the tokens that precede it;
+        // its position is the index handed to the instance
+        index = tokens.size();
+        tokens.add(rawWord.toLowerCase());
+        tokens.addAll(textAfter);
+
+        finalText = tokens.toArray(new String[tokens.size()]);
+      }
+
+    }
+
+    if (finalText == null) {
+      return null;
+    }
+    return new WTDIMS(finalText, index, answers);
+  }
+
+  /**
+   * Splits a text on single whitespace characters and drops the empty tokens.
+   *
+   * @param text
+   *          the text to split
+   * @return the non-empty tokens, in their original order
+   */
+  private static ArrayList<String> tokenize(String text) {
+
+    ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(text
+        .split("\\s")));
+    tokens.removeAll(Collections.singleton(""));
+
+    return tokens;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java?rev=1694008&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
 Tue Aug  4 07:48:48 2015
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.util.ArrayList;
+
+/**
+ * A sentence of a corpus document, held as an ordered list of {@link Word}
+ * entries together with two numeric identifiers (presumably the paragraph and
+ * sentence numbers, mirroring the fields of the same name on {@link Word}).
+ */
+public class Sentence {
+
+  protected int pnum;
+  protected int snum;
+  protected ArrayList<Word> iwords;
+
+  /**
+   * Creates a sentence with no words; both identifiers keep their default
+   * value of 0.
+   */
+  public Sentence() {
+    super();
+    this.iwords = new ArrayList<Word>();
+  }
+
+  /**
+   * Creates a sentence with no words.
+   * 
+   * @param pnum
+   *          value stored in {@link #pnum}
+   * @param snum
+   *          value stored in {@link #snum}
+   */
+  public Sentence(int pnum, int snum) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = new ArrayList<Word>();
+  }
+
+  /**
+   * Creates a sentence backed by the given word list. The list is stored as
+   * is (no copy), so later changes made by the caller are visible here.
+   * 
+   * @param pnum
+   *          value stored in {@link #pnum}
+   * @param snum
+   *          value stored in {@link #snum}
+   * @param iwords
+   *          the words of the sentence, in order
+   */
+  public Sentence(int pnum, int snum, ArrayList<Word> iwords) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = iwords;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  /**
+   * @return the backing word list itself (not a copy)
+   */
+  public ArrayList<Word> getIwords() {
+    return iwords;
+  }
+
+  public void setIwords(ArrayList<Word> iwords) {
+    this.iwords = iwords;
+  }
+
+  /**
+   * Appends a word to the end of the sentence.
+   * 
+   * @param iword
+   *          the word to append
+   */
+  public void addIword(Word iword) {
+    // The list can be null after setIwords(null) or the 3-arg constructor;
+    // recreate it instead of failing with a NullPointerException.
+    if (this.iwords == null) {
+      this.iwords = new ArrayList<Word>();
+    }
+    this.iwords.add(iword);
+  }
+
+  /**
+   * Joins the string form of every word with single spaces.
+   * 
+   * @return the words separated by one space each, or the empty string when
+   *         the sentence has no words
+   */
+  @Override
+  public String toString() {
+    // An empty sentence used to reach substring(1, 0) and throw
+    // StringIndexOutOfBoundsException.
+    if (this.iwords == null || this.iwords.isEmpty()) {
+      return "";
+    }
+
+    StringBuilder sentence = new StringBuilder();
+    for (int i = 0; i < this.iwords.size(); i++) {
+      if (i > 0) {
+        sentence.append(' ');
+      }
+      sentence.append(this.iwords.get(i));
+    }
+    return sentence.toString();
+
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java?rev=1694008&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
 Tue Aug  4 07:48:48 2015
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import opennlp.tools.disambiguator.Constants;
+
+/**
+ * A single token of a corpus sentence together with its position identifiers
+ * and, when available, its annotation (PoS tag, lemma and WordNet sense
+ * information).
+ */
+public class Word {
+
+  /**
+   * The kind of token: a regular word or a punctuation mark. Each constant
+   * carries a numeric code and a short textual label.
+   */
+  public static enum Type {
+    WORD(1, "word"), PUNCTUATIONMARK(2, "pm");
+
+    public int code;
+    public String type;
+
+    private Type(int code, String type) {
+      this.code = code;
+      this.type = type;
+    }
+  }
+
+  protected int pnum;
+  protected int snum;
+  protected int wnum;
+
+  // Type refers to the type of word in the sentence
+  protected Type type;
+
+  protected String word;
+  protected String cmd;
+  protected String pos;
+  protected String lemma;
+  protected String wnsn;
+  protected String lexsn;
+
+  /**
+   * Creates an empty word; every field keeps its default value.
+   */
+  public Word() {
+    super();
+  }
+
+  /**
+   * Creates a word from a lemma and a PoS tag. The lemma is also used as the
+   * raw word.
+   * 
+   * @param lemma
+   *          The lemma of the word
+   * @param pos
+   *          The PoS Tag of the word
+   */
+  public Word(String lemma, String pos) {
+    super();
+    this.word = lemma;
+    this.lemma = lemma;
+    this.pos = pos;
+  }
+
+  /**
+   * This serves to create a DISAMBIGUATED word instance
+   * 
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          the type in this case is {@link Type#WORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   * @param lemma
+   *          The lemma of the word
+   * @param wnsn
+   *          The integer sense number corresponding to the WordNet output
+   *          display
+   * @param lexsn
+   *          The "Sense_key" that indicates the WordNet sense to which word
+   *          should be linked
+   * 
+   */
+  public Word(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos, String lemma, String wnsn, String lexsn) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.wnum = wnum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+    this.lemma = lemma;
+    this.wnsn = wnsn;
+    this.lexsn = lexsn;
+  }
+
+  /**
+   * This serves to create a NON DISAMBIGUATED word instance
+   * 
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          the type in this case is {@link Type#WORD}
+   * @param word
+   *          The raw word, as it appears in the sentence
+   * @param cmd
+   *          Whether it is semantically disambiguated or not (or to be
+   *          disambiguated)
+   * @param pos
+   *          The PoS Tag of the word
+   * 
+   */
+  public Word(int pnum, int snum, int wnum, Type type, String word,
+      String cmd, String pos) {
+    super();
+    this.wnum = wnum;
+    this.pnum = pnum;
+    this.snum = snum;
+    this.type = type;
+    this.word = word;
+    this.cmd = cmd;
+    this.pos = pos;
+  }
+
+  /**
+   * This serves to create a punctuation instances
+   * 
+   * @param pnum
+   *          id of the paragraph
+   * @param snum
+   *          id of the sentence
+   * @param wnum
+   *          id of the word in the sentence
+   * @param type
+   *          The type as in {@link Type}
+   * @param word
+   *          The punctuation mark, as it appears in the sentence
+   */
+  public Word(int pnum, int snum, int wnum, Type type, String word) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    // wnum was accepted but silently dropped before, leaving it at 0.
+    this.wnum = wnum;
+    this.type = type;
+    this.word = word;
+  }
+
+  public int getPnum() {
+    return pnum;
+  }
+
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  public int getSnum() {
+    return snum;
+  }
+
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  public int getWnum() {
+    return wnum;
+  }
+
+  public void setWnum(int wnum) {
+    this.wnum = wnum;
+  }
+
+  public String getWord() {
+    return word;
+  }
+
+  public void setWord(String word) {
+    this.word = word;
+  }
+
+  public Type getType() {
+    return type;
+  }
+
+  public void setType(Type type) {
+    this.type = type;
+  }
+
+  public String getCmd() {
+    return cmd;
+  }
+
+  public void setCmd(String cmd) {
+    this.cmd = cmd;
+  }
+
+  public String getPos() {
+    return pos;
+  }
+
+  public void setPos(String pos) {
+    this.pos = pos;
+  }
+
+  public String getLemma() {
+    return lemma;
+  }
+
+  public void setLemma(String lemma) {
+    this.lemma = lemma;
+  }
+
+  public String getWnsn() {
+    return wnsn;
+  }
+
+  public void setWnsn(String wnsn) {
+    this.wnsn = wnsn;
+  }
+
+  public String getLexsn() {
+    return lexsn;
+  }
+
+  public void setLexsn(String lexsn) {
+    this.lexsn = lexsn;
+  }
+
+  /**
+   * @return the raw word (may be {@code null} when it was never set)
+   */
+  @Override
+  public String toString() {
+    return this.word;
+  }
+
+  /**
+   * Compares two words loosely: when both carry a lemma, the lemmas and the
+   * values of {@code Constants.getPOS} for both tags must match; otherwise
+   * the raw words and the values of {@code Constants.getPOSabbreviation} must
+   * match. Missing (null) values on both sides count as a match instead of
+   * raising a NullPointerException.
+   * 
+   * NOTE(review): hashCode() is not overridden. Because this relation is not
+   * transitive (a lemma-less word can equal two words whose lemmas differ),
+   * only a constant hash would satisfy the contract, so instances should not
+   * be used as keys of hash-based collections.
+   * 
+   * @param oword
+   *          the object to compare with
+   * @return {@code true} when {@code oword} is a {@link Word} matching this
+   *         one as described above
+   */
+  @Override
+  public boolean equals(Object oword) {
+
+    if (oword == this) {
+      return true;
+    }
+    if (!(oword instanceof Word)) {
+      return false;
+    }
+
+    Word iword = (Word) oword;
+
+    if (this.lemma != null && iword.getLemma() != null) {
+      return iword.getLemma().equals(this.getLemma())
+          && nullSafeEquals(Constants.getPOS(iword.getPos()),
+              Constants.getPOS(this.getPos()));
+    }
+
+    return nullSafeEquals(this.word, iword.getWord())
+        && nullSafeEquals(Constants.getPOSabbreviation(this.getPos()),
+            Constants.getPOSabbreviation(iword.getPos()));
+  }
+
+  /**
+   * Tells whether this word is a sense-annotated occurrence of the given
+   * word tag.
+   * 
+   * @param wordTag
+   *          the tag in the form "lemma.pos"; only the first two
+   *          dot-separated parts are used
+   * @return {@code true} when this word has a sense key, its lemma equals
+   *         the first part and its PoS abbreviation equals the second part;
+   *         {@code false} otherwise, including for a null or dot-less tag
+   */
+  public boolean isInstanceOf(String wordTag) {
+
+    if (wordTag == null || this.lemma == null || this.lexsn == null) {
+      return false;
+    }
+
+    // A tag without a '.' used to fail with ArrayIndexOutOfBoundsException.
+    String[] parts = wordTag.split("\\.");
+    if (parts.length < 2) {
+      return false;
+    }
+
+    String tag = Constants.getPOSabbreviation(this.getPos());
+
+    // parts[1] is never null, so a null tag simply does not match.
+    return this.lemma.equals(parts[0]) && parts[1].equals(tag);
+  }
+
+  /**
+   * Compares two words by lemma, by the value of {@code Constants.getPOS}
+   * for their tags, and by sense key.
+   * 
+   * @param oword
+   *          the object to compare with
+   * @return {@code true} when {@code oword} is this very instance, or a
+   *         {@link Word} with the same lemma, matching PoS and the same
+   *         sense key; {@code false} when the other word lacks a lemma or a
+   *         sense key
+   */
+  public boolean senseEquals(Object oword) {
+
+    if (!(oword instanceof Word))
+      return false;
+    if (oword == this)
+      return true;
+
+    Word iword = (Word) oword;
+
+    // A word without lemma or sense key cannot match; this used to throw a
+    // NullPointerException.
+    if (iword.getLemma() == null || iword.getLexsn() == null) {
+      return false;
+    }
+
+    return iword.getLemma().equals(this.getLemma())
+        && nullSafeEquals(Constants.getPOS(iword.getPos()),
+            Constants.getPOS(this.getPos()))
+        && iword.getLexsn().equals(this.getLexsn());
+  }
+
+  /**
+   * Null-tolerant equality: two nulls are equal, a null never equals a
+   * non-null value.
+   */
+  private static boolean nullSafeEquals(Object first, Object second) {
+    return first == null ? second == null : first.equals(second);
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
 Tue Aug  4 07:48:48 2015
@@ -58,8 +58,8 @@ import opennlp.tools.disambiguator.WSDPa
 import opennlp.tools.disambiguator.WordPOS;
 import opennlp.tools.disambiguator.WSDisambiguator;
 import opennlp.tools.disambiguator.WordToDisambiguate;
-import opennlp.tools.disambiguator.DatasetsReader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
 
 /**
  * Implementation of the <b>It Makes Sense</b> approach originally proposed in

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
 Tue Aug  4 07:48:48 2015
@@ -33,6 +33,7 @@ import opennlp.tools.disambiguator.WordP
 import opennlp.tools.disambiguator.WordSense;
 import opennlp.tools.util.Span;
 import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
 import net.sf.extjwnl.data.Synset;
 import net.sf.extjwnl.data.Word;
 
@@ -95,6 +96,36 @@ public class Lesk implements WSDisambigu
     return params;
   }
 
+  /**
+   * Returns the sense key of the target word in the first synset that
+   * WordNet lists for it. The caller uses this as the most-frequent-sense
+   * fallback (assumes {@code WordPOS.getSynsets()} orders synsets by
+   * frequency -- TODO confirm).
+   * 
+   * @param wtd
+   *          the word to disambiguate
+   * @return the sense key, or {@code null} when the PoS tag cannot be mapped,
+   *         the word has no synsets, no word of the first synset matches the
+   *         target, or the sense key lookup fails
+   */
+  protected String getMostFrequentSenseKey(WTDLesk wtd) {
+
+    String word = wtd.getRawWord().toLowerCase();
+    POS pos = Constants.getPOS(wtd.getPosTag());
+    String senseKey = null;
+
+    if (pos != null) {
+
+      WordPOS wordPOS = new WordPOS(word, pos);
+
+      ArrayList<Synset> synsets = wordPOS.getSynsets();
+
+      // split() yields an empty array for a raw word such as ".".
+      String[] rawParts = wtd.getRawWord().split("\\.");
+
+      // A word without synsets used to fail on synsets.get(0).
+      if (rawParts.length > 0 && synsets != null && !synsets.isEmpty()) {
+
+        String target = rawParts[0];
+
+        for (Word wd : synsets.get(0).getWords()) {
+          // The lookup above uses the lower-cased word, so compare without
+          // regard to case; otherwise a capitalised token never matches.
+          if (wd.getLemma().equalsIgnoreCase(target)) {
+            try {
+              senseKey = wd.getSenseKey();
+            } catch (JWNLException e) {
+              e.printStackTrace();
+            }
+            break;
+          }
+        }
+      }
+    }
+    return senseKey;
+  }
+
   /**
    * The basic Lesk method where the entire context is considered for overlaps
    * 
@@ -980,23 +1011,30 @@ public class Lesk implements WSDisambigu
 
     Collections.sort(wsenses);
 
-    List<Word> synsetWords;
-    String[] senses = new String[wsenses.size()];
-    String senseKey = "?";
-    for (int i = 0; i < wsenses.size(); i++) {
-      synsetWords = wsenses.get(i).getNode().synset.getWords();
-      for (Word synWord : synsetWords) {
-        if (synWord.getLemma().equals(wtd.getWord())) {
-          try {
-            senseKey = synWord.getSenseKey();
-          } catch (JWNLException e) {
-            e.printStackTrace();
+    String[] senses;
+    if (wsenses.get(0).getScore() > 0) { // if at least one overlap
+      List<Word> synsetWords;
+      senses = new String[wsenses.size()];
+      String senseKey = "?";
+      for (int i = 0; i < wsenses.size(); i++) {
+        synsetWords = wsenses.get(i).getNode().synset.getWords();
+        for (Word synWord : synsetWords) {
+          if (synWord.getLemma().equals(wtd.getWord())) {
+            try {
+              senseKey = synWord.getSenseKey();
+            } catch (JWNLException e) {
+              e.printStackTrace();
+            }
+            break;
           }
-          break;
         }
-      }
-      senses[i] = "WordNet" + " " + senseKey + " " + wsenses.get(i).getScore();
+        senses[i] = "WordNet" + " " + senseKey + " "
+            + wsenses.get(i).getScore();
 
+      }
+    } else { // get the MFS if no overlaps
+      senses = new String[1];
+      senses[0] = "WordNet" + " " + this.getMostFrequentSenseKey(wtd) + " -1";
     }
     return senses;
   }

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
 Tue Aug  4 07:48:48 2015
@@ -40,10 +40,10 @@ public class LeskParameters extends WSDP
 
   // DEFAULTS
   protected static final LESK_TYPE DFLT_LESK_TYPE = 
LESK_TYPE.LESK_EXT_EXP_CTXT_WIN;
-  protected static final int DFLT_WIN_SIZE = 4;
-  protected static final int DFLT_DEPTH = 3;
-  protected static final double DFLT_IEXP = 0.3;
-  protected static final double DFLT_DEXP = 0.3;
+  protected static final int DFLT_WIN_SIZE = 5;
+  protected static final int DFLT_DEPTH = 2;
+  protected static final double DFLT_IEXP = 0.4;
+  protected static final double DFLT_DEXP = 0.4;
 
   protected LESK_TYPE leskType;
   protected int win_f_size;

Modified: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
 Tue Aug  4 07:48:48 2015
@@ -22,7 +22,7 @@ package opennlp.tools.disambiguator;
 import java.io.File;
 import java.util.ArrayList;
 
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
 import opennlp.tools.disambiguator.ims.IMS;
 import opennlp.tools.disambiguator.ims.IMSParameters;
 import opennlp.tools.disambiguator.ims.WTDIMS;

Modified: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
 Tue Aug  4 07:48:48 2015
@@ -19,10 +19,9 @@
 
 package opennlp.tools.disambiguator;
 
-import java.io.File;
 import java.util.ArrayList;
-import java.util.HashMap;
 
+import opennlp.tools.disambiguator.datareader.SensevalReader;
 import opennlp.tools.disambiguator.ims.WTDIMS;
 import opennlp.tools.disambiguator.lesk.Lesk;
 import opennlp.tools.disambiguator.lesk.LeskParameters;
@@ -31,80 +30,58 @@ import org.junit.Test;
 
 public class LeskEvaluatorTest {
 
-  static DataExtractor dExtractor = new DataExtractor();
+  static SensevalReader seReader = new SensevalReader();
 
   @Test
   public static void main(String[] args) {
     Constants.print("Evaluation Started");
 
-    String testDataLoc = "src\\test\\resources\\data\\";
-    String helpersLoc = "src\\test\\resources\\helpers\\";
-
-    File[] listOfFiles;
-    File testFolder = new File(testDataLoc);
-
-    // these are needed for mapping the sense IDs from the current data
-    String dict = helpersLoc + "EnglishLS.dictionary.xml";
-    String map = helpersLoc + "EnglishLS.sensemap";
-
     Lesk lesk = new Lesk();
     LeskParameters leskParams = new LeskParameters();
     leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
     lesk.setParams(leskParams);
 
-    if (testFolder.isDirectory()) {
-      listOfFiles = testFolder.listFiles();
-      for (File file : listOfFiles) {
-        WSDEvaluator evaluator = new WSDEvaluator(lesk);
-        if (file.isFile()) {
-          // don't take verbs because they are not from WordNet
-          if (!file.getName().split("\\.")[1].equals("v")) {
-            HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
-                .extractWordSenses(dict, map, file.getName());
-            ArrayList<WTDIMS> instances = getTestData(file.getAbsolutePath(),
-                senses);
-
-            if (instances != null) {
-              Constants.print("------------------" + file.getName()
-                  + "------------------");
-              for (WordToDisambiguate instance : instances) {
-                // Constants.print("sense IDs : " + instance.senseIDs);
-                evaluator.evaluateSample(instance);
-              }
-              Constants.print(evaluator.toString());
-            } else {
-              Constants.print("null instances");
+    ArrayList<String> words = seReader.getSensevalWords();
+
+    for (String word : words) {
+      WSDEvaluator evaluator = new WSDEvaluator(lesk);
+
+      // don't take verbs because they are not from WordNet
+      if (!word.split("\\.")[1].equals("v")) {
+
+        ArrayList<WTDIMS> instances = getTestData(word);
+
+        if (instances != null) {
+          Constants.print("------------------" + word + "------------------");
+          for (WordToDisambiguate instance : instances) {
+
+            if (instance.getSenseIDs() != null
+                && !instance.getSenseIDs().get(0).equals("null")) {
+              evaluator.evaluateSample(instance);
             }
           }
+          Constants.print(evaluator.toString());
+        } else {
+          Constants.print("null instances");
         }
       }
+
     }
   }
 
-  protected static ArrayList<WTDIMS> getTestData(String testFile,
-      HashMap<String, ArrayList<DictionaryInstance>> senses) {
-    /**
-     * word tag has to be in the format "word.POS" (e.g., "activate.v",
-     * "smart.a", etc.)
-     */
-    ArrayList<WTDIMS> trainingData = dExtractor.extractWSDInstances(testFile);
-
-    // HashMap<Integer, WTDIMS> trainingData =
-    // dExtractor.extractWSDInstances(wordTrainingxmlFile);
-    for (WTDIMS data : trainingData) {
-      for (String senseId : data.getSenseIDs()) {
-        for (String dictKey : senses.keySet()) {
-          for (DictionaryInstance instance : senses.get(dictKey)) {
-            if (senseId.equals(instance.getId())) {
-              data.setSense(Integer.parseInt(dictKey.split("_")[1]));
-              break;
-            }
-          }
+  protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+
+    ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+    for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+      WTDIMS wtdims = new WTDIMS(wtd);
+      if (wtdims != null) {
+        if (wtdims.getSenseIDs().get(0) != null
+            && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {
+          instances.add(wtdims);
         }
       }
     }
-
-    return trainingData;
+    return instances;
   }
 
 }

Modified: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
 Tue Aug  4 07:48:48 2015
@@ -21,7 +21,7 @@ package opennlp.tools.disambiguator;
 
 import java.util.ArrayList;
 
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
 import opennlp.tools.disambiguator.ims.WTDIMS;
 import opennlp.tools.disambiguator.mfs.MFS;
 import opennlp.tools.disambiguator.mfs.MFSParameters;

svn commit: r1694008 - in /opennlp/sandbox/opennlp-wsd: ./ src/main/java/opennlp/tools/disambiguator/ src/main/java/opennlp/tools/disambiguator/DatasetsReader/ src/main/java/opennlp/tools/disambiguator/datareader/ src/main/java/opennlp/tools/disambigua...

Reply via email to