Author: joern
Date: Thu Aug 20 22:01:59 2015
New Revision: 1696865
URL: http://svn.apache.org/r1696865
Log:
OPENNLP-801: IMS no longer performs the pre-processing steps (the user will
have to apply them beforehand). Thanks to Mondher Bouazizi for providing a patch!
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordToDisambiguate.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/mfs/MFSParameters.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSTester.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskTester.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSTester.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
Thu Aug 20 22:01:59 2015
@@ -1,403 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-
-import opennlp.tools.disambiguator.lesk.Lesk;
-import net.sf.extjwnl.JWNLException;
-import net.sf.extjwnl.data.POS;
-
-public class Constants {
-
- private static String resourcesFolder = "src\\test\\resources\\";
-
- private static String englishDict = resourcesFolder
- + "models\\en-lemmatizer.dict";
-
- public static String osPathChar = "\\";
-
- // List of all the PoS tags
- public static String[] allPOS = { "CC", "CD", "DT", "EX", "FW", "IN", "JJ",
- "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
- "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD",
- "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB" };
-
- // List of the PoS tags of which the senses are to be extracted
- public static String[] relevantPOS = { "JJ", "JJR", "JJS", "NN", "NNS", "RB",
- "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" };
-
- // List of Negation Words
- public static ArrayList<String> negationWords = new ArrayList<String>(
- Arrays.asList("not", "no", "never", "none", "nor", "non"));
-
- // List of Stop Words
- public static ArrayList<String> stopWords = new ArrayList<String>(
- Arrays.asList("a", "able", "about", "above", "according", "accordingly",
- "across", "actually", "after", "afterwards", "again", "against",
- "ain't", "all", "allow", "allows", "almost", "alone", "along",
- "already", "also", "although", "always", "am", "among", "amongst",
- "an", "and", "another", "any", "anybody", "anyhow", "anyone",
- "anything", "anyway", "anyways", "anywhere", "apart", "appear",
- "appreciate", "appropriate", "are", "aren't", "around", "as",
- "aside", "ask", "asking", "associated", "at", "available", "away",
- "awfully", "be", "became", "because", "become", "becomes",
- "becoming", "been", "before", "beforehand", "behind", "being",
- "believe", "below", "beside", "besides", "best", "better", "between",
- "beyond", "both", "brief", "but", "by", "came", "can", "cannot",
- "cant", "can't", "cause", "causes", "certain", "certainly",
- "changes", "clearly", "c'mon", "co", "com", "come", "comes",
- "concerning", "consequently", "consider", "considering", "contain",
- "containing", "contains", "corresponding", "could", "couldn't",
- "course", "c's", "currently", "definitely", "described", "despite",
- "did", "didn't", "different", "do", "does", "doesn't", "doing",
- "done", "don't", "down", "downwards", "during", "each", "edu", "eg",
- "eight", "either", "else", "elsewhere", "enough", "entirely",
- "especially", "et", "etc", "even", "ever", "every", "everybody",
- "everyone", "everything", "everywhere", "ex", "exactly", "example",
- "except", "far", "few", "fifth", "first", "five", "followed",
- "following", "follows", "for", "former", "formerly", "forth", "four",
- "from", "further", "furthermore", "get", "gets", "getting", "given",
- "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings",
- "had", "hadn't", "happens", "hardly", "has", "hasn't", "have",
- "haven't", "having", "he", "hello", "help", "hence", "her", "here",
- "hereafter", "hereby", "herein", "here's", "hereupon", "hers",
- "herself", "he's", "hi", "him", "himself", "his", "hither",
- "hopefully", "how", "howbeit", "however", "i", "i'd", "ie", "if",
- "ignored", "i'll", "i'm", "immediate", "in", "inasmuch", "inc",
- "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
- "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll",
- "its", "it's", "itself", "i've", "just", "keep", "keeps", "kept",
- "know", "known", "knows", "last", "lately", "later", "latter",
- "latterly", "least", "less", "lest", "let", "let's", "like", "liked",
- "likely", "little", "look", "looking", "looks", "ltd", "mainly",
- "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might",
- "more", "moreover", "most", "mostly", "much", "must", "my", "myself",
- "name", "namely", "nd", "near", "nearly", "necessary", "need",
- "needs", "neither", "never", "nevertheless", "new", "next", "nine",
- "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
- "nothing", "novel", "now", "nowhere", "obviously", "of", "off",
- "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones",
- "only", "onto", "or", "other", "others", "otherwise", "ought", "our",
- "ours", "ourselves", "out", "outside", "over", "overall", "own",
- "particular", "particularly", "per", "perhaps", "placed", "please",
- "plus", "possible", "presumably", "probably", "provides", "que",
- "quite", "qv", "rather", "rd", "re", "really", "reasonably",
- "regarding", "regardless", "regards", "relatively", "respectively",
- "right", "said", "same", "saw", "say", "saying", "says", "second",
- "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
- "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
- "seven", "several", "shall", "she", "should", "shouldn't", "since",
- "six", "so", "some", "somebody", "somehow", "someone", "something",
- "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
- "specified", "specify", "specifying", "still", "sub", "such", "sup",
- "sure", "take", "taken", "tell", "tends", "th", "than", "thank",
- "thanks", "thanx", "that", "thats", "that's", "the", "their",
- "theirs", "them", "themselves", "then", "thence", "there",
- "thereafter", "thereby", "therefore", "therein", "theres", "there's",
- "thereupon", "these", "they", "they'd", "they'll", "they're",
- "they've", "think", "third", "this", "thorough", "thoroughly",
- "those", "though", "three", "through", "throughout", "thru", "thus",
- "to", "together", "too", "took", "toward", "towards", "tried",
- "tries", "truly", "try", "trying", "t's", "twice", "two", "un",
- "under", "unfortunately", "unless", "unlikely", "until", "unto",
- "up", "upon", "us", "use", "used", "useful", "uses", "using",
- "usually", "value", "various", "very", "via", "viz", "vs", "want",
- "wants", "was", "wasn't", "way", "we", "we'd", "welcome", "well",
- "we'll", "went", "were", "we're", "weren't", "we've", "what",
- "whatever", "what's", "when", "whence", "whenever", "where",
- "whereafter", "whereas", "whereby", "wherein", "where's",
- "whereupon", "wherever", "whether", "which", "while", "whither",
- "who", "whoever", "whole", "whom", "who's", "whose", "why", "will",
- "willing", "wish", "with", "within", "without", "wonder", "won't",
- "would", "wouldn't", "yes", "yet", "you", "you'd", "you'll", "your",
- "you're", "yours", "yourself", "yourselves", "you've", "zero"));
-
- // Print a text in the console
- // Print a text in the console
- public static void printResults(WSDisambiguator disambiguator,
- String[] results) {
-
- if (results != null) {
-
- String[] parts;
- String sensekey;
- if (disambiguator instanceof Lesk) {
-
- Double score;
-
- for (int i = 0; i < results.length; i++) {
- parts = results[i].split(" ");
- sensekey = parts[1];
- score = Double.parseDouble(parts[2]);
- try {
- Constants.print("score : "
- + score
- + " for sense "
- + i
- + " : "
- + sensekey
- + " : "
- + Loader.getDictionary().getWordBySenseKey(sensekey)
- .getSynset().getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
- } else {
- for (int i = 0; i < results.length; i++) {
- parts = results[i].split(" ");
- sensekey = parts[1];
- try {
- Constants.print("sense "
- + i
- + " : "
- + sensekey
- + " : "
- + Loader.getDictionary().getWordBySenseKey(sensekey)
- .getSynset().getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- }
-
- public static void print(Object in) {
- if (in == null) {
- System.out.println("object is null");
- } else {
- System.out.println(in);
- }
- }
-
- public static void print(Object[] array) {
- if (array == null) {
- System.out.println("object is null");
- } else {
- System.out.println(Arrays.asList(array));
- }
- }
-
- public static void print(Object[][] array) {
- if (array == null) {
- System.out.println("object is null");
- } else {
- System.out.print("[");
- for (int i = 0; i < array.length; i++) {
- print(array[i]);
- if (i != array.length - 1) {
- System.out.print("\n");
- }
- print("]");
- }
- }
- }
-
- /**
- * Extract the list of ALL English words
- *
- * @param dict
- * this file is the same that is used in the simple Lemmatizer
- * (i.e.,"en-lemmatizer.dict")
- *
- * @return a list of all the English words
- */
- public static HashMap<String, Object> getEnglishWords(String dict) {
-
- HashMap<String, Object> words = new HashMap<String, Object>();
-
- BufferedReader br = null;
-
- File file = new File(englishDict);
-
- if (file.exists()) {
-
- try {
- br = new BufferedReader(new FileReader(file));
- String line = br.readLine();
- while (line != null) {
- line = br.readLine();
- if (line != null) {
- String word = line.split("\\t")[0];
- words.put(word, null);
- }
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- return words;
- }
-
- /**
- * return the PoS (Class POS) out of the PoS-tag
- *
- * @param posTag
- * PoS tag (e.g., "JJS", "NNP", etc.)
- * @return the Part of Speech (type {@link POS})
- */
- public static POS getPOS(String posTag) {
-
- ArrayList<String> adjective = new ArrayList<String>(Arrays.asList("JJ",
- "JJR", "JJS"));
- ArrayList<String> adverb = new ArrayList<String>(Arrays.asList("RB", "RBR",
- "RBS"));
- ArrayList<String> noun = new ArrayList<String>(Arrays.asList("NN", "NNS",
- "NNP", "NNPS"));
- ArrayList<String> verb = new ArrayList<String>(Arrays.asList("VB", "VBD",
- "VBG", "VBN", "VBP", "VBZ"));
-
- if (adjective.contains(posTag))
- return POS.ADJECTIVE;
- else if (adverb.contains(posTag))
- return POS.ADVERB;
- else if (noun.contains(posTag))
- return POS.NOUN;
- else if (verb.contains(posTag))
- return POS.VERB;
- else
- return null;
-
- }
-
- /**
- * Check whether a PoS Tag is relevant of not. A PoS Tag is considered
- * relevant when it corresponds to:
- * <ul>
- * <li>VERB</li>
- * <li>ADJECTIVE</li>
- * <li>ADVERB</li>
- * <li>NOUN</li>
- * </ul>
- *
- * @param posTag
- * the PoS Tag to verify the relevance.
- * @return whether a PoS Tag corresponds to a relevant Part of Speech (type
- * {@link POS}) or not ( true} if it is, false} otherwise)
- */
- public static boolean isRelevant(String posTag) {
- return getPOS(posTag) != null;
- }
-
- /**
- * Check whether a PoS Tag is relevant of not. A PoS Tag is considered
- * relevant when it is:
- * <ul>
- * <li>VERB</li>
- * <li>ADJECTIVE</li>
- * <li>ADVERB</li>
- * <li>NOUN</li>
- * </ul>
- *
- * @param pos
- * The Part of Speech of Type {@link POS}
- * @return whether a Part of Speech is relevant (true) or not (false)
- */
- public static boolean isRelevant(POS pos) {
- return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
- || pos.equals(POS.NOUN) || pos.equals(POS.VERB);
- }
-
- public static String getPOSabbreviation(String posTag) {
-
- if (posTag == null) {
- return null;
- }
- if (posTag.startsWith("JJ")) {
- return "a";
- } else if (posTag.startsWith("RB")) {
- return "r";
- } else if (posTag.startsWith("VB") || posTag.equals("MD")) {
- return "v";
- } else if (posTag.startsWith("NN")) {
- return "n";
- }
-
- return null;
-
- }
-
- /**
- * Check whether a list of arrays contains an array
- *
- * @param array
- * The array To check
- * @param fullList
- * The full list of Arrays
- * @return whether the {@link ArrayList} of arrays contains the array (true)
- * or not (false)
- */
- public static boolean belongsTo(String[] array, ArrayList<String[]>
fullList) {
- for (String[] refArray : fullList) {
- if (areStringArraysEqual(array, refArray))
- return true;
- }
- return false;
- }
-
- /**
- * Check whether two arrays of strings are equal
- *
- * @param array1
- * first array
- * @param array2
- * second array
- * @return whether the two arrays are identical (true) or not (false)
- */
- public static boolean areStringArraysEqual(String[] array1, String[] array2)
{
-
- if (array1.equals(null) || array2.equals(null))
- return false;
-
- if (array1.length != array2.length) {
- return false;
- }
- for (int i = 0; i < array1.length; i++) {
- if (!array1[i].equals(array2[i])) {
- return false;
- }
- }
-
- return true;
-
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java
Thu Aug 20 22:01:59 2015
@@ -1,414 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-import opennlp.tools.disambiguator.DictionaryInstance;
-import opennlp.tools.disambiguator.ims.WTDIMS;
-
-/**
- * This class handles the extraction of data from the different files (training
- * data, dictionary instances, etc.)
- */
-
-public class DataExtractor {
-
- private static String englishDict =
"src\\test\\resources\\models\\en-lemmatizer.dict";
-
- /**
- * Constructor
- */
- public DataExtractor() {
- super();
- }
-
- private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {
-
- ArrayList<DictionaryInstance> dictionary = new
ArrayList<DictionaryInstance>();
-
- try {
-
- File xmlFile = new File(xmlLocation);
- DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
- DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
- Document doc = dBuilder.parse(xmlFile);
- doc.getDocumentElement().normalize();
-
- NodeList nLexelts = doc.getElementsByTagName("lexelt");
-
- int index = 0;
-
- for (int i = 0; i < nLexelts.getLength(); i++) {
-
- Node nLexelt = nLexelts.item(i);
-
- Element eLexelt = (Element) nLexelt;
-
- String word = eLexelt.getAttribute("item");
-
- if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
-
- NodeList nSenses = eLexelt.getChildNodes();
-
- for (int j = 0; j < nSenses.getLength(); j++) {
-
- if (nSenses.item(j).getNodeType() == Node.ELEMENT_NODE) {
-
- Element eSense = (Element) nSenses.item(j);
-
- int ind = index; // rather use this than the ID used by default
- String id = eSense.getAttribute("id");
- String source = eSense.getAttribute("source");
- String[] synset = eSense.getAttribute("synset").split("\\s");
- String gloss = eSense.getAttribute("gloss");
-
- DictionaryInstance wd = new DictionaryInstance(ind, word, id,
- source, synset, gloss);
-
- dictionary.add(wd);
- index++;
- }
- }
-
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- return dictionary;
-
- }
-
- private HashMap<Integer, ArrayList<String>> getEquivalentSense(
- String sensemapFile) {
-
- HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer,
ArrayList<String>>();
-
- try (BufferedReader wordsList = new BufferedReader(new FileReader(
- sensemapFile))) {
-
- int index = 0;
-
- String line;
-
- // Read the file
- while ((line = wordsList.readLine()) != null) {
-
- String[] temp = line.split("\\s");
-
- ArrayList<String> tempSenses = new ArrayList<String>();
-
- for (String sense : temp) {
- if (sense.length() > 1) {
- // System.out.println(sense);
- tempSenses.add(sense);
- }
- }
-
- mappedSenses.put(index, tempSenses);
- // System.out.println(index);
- index++;
-
- }
-
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- return mappedSenses;
-
- }
-
- private HashMap<String, ArrayList<DictionaryInstance>>
extractCoarseGrainedDictionary(
- String xmlLocation, String sensemapFile) {
-
- HashMap<String, ArrayList<DictionaryInstance>> optimizedDictionary = new
HashMap<String, ArrayList<DictionaryInstance>>();
-
- HashMap<Integer, ArrayList<String>> equivalentSenses =
getEquivalentSense(sensemapFile);
-
- ArrayList<DictionaryInstance> dictionary = extractDictionary(xmlLocation);
-
- for (int mapKey : equivalentSenses.keySet()) {
- ArrayList<String> sensesIds = equivalentSenses.get(mapKey);
- ArrayList<DictionaryInstance> optimizedDictionaryInstance = new
ArrayList<DictionaryInstance>();
-
- String word = "";
-
- for (String senseId : sensesIds) {
- for (int i = 0; i < dictionary.size(); i++) {
- if (dictionary.get(i).getId().equals(senseId)) {
- optimizedDictionaryInstance.add(dictionary.get(i));
- word = dictionary.get(i).getWord();
- word = word + "_" + mapKey;
- break;
- }
- }
-
- }
-
- optimizedDictionary.put(word, optimizedDictionaryInstance);
- }
-
- return optimizedDictionary;
- }
-
- /**
- * Extract the different senses (those which are equivalent are put together)
- * of a word
- *
- * @param xmlLocation
- * : location of the file containing the dictionary instances
- * @param sensemapFile
- * : location of the file containing the equivalent senses in the
- * case of Coarse-grained disambiguation
- * @param wordTag
- * : the word to disambiguate. It should be written in the format
- * "word.p" (Exp: "write.v", "well.r", "smart.a", "go.v"
- * @return a {@link HashMap} of {@link DictionaryInstance} with their IDs
- */
- public HashMap<String, ArrayList<DictionaryInstance>> extractWordSenses(
- String xmlLocation, String sensemapFile, String wordTag) {
-
- /**
- * word tag has to be in the format "word.t" (e.g., "activate.v",
"smart.a",
- * etc.)
- */
-
- HashMap<String, ArrayList<DictionaryInstance>> wordSenses = new
HashMap<String, ArrayList<DictionaryInstance>>();
-
- HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary =
extractCoarseGrainedDictionary(
- xmlLocation, sensemapFile);
-
- int i = 0;
- for (String key : optimalDictionary.keySet()) {
- if (key.startsWith(wordTag)) {
- String newKey = wordTag + "_" + i;
- wordSenses.put(newKey, optimalDictionary.get(key));
- i++;
- }
- }
-
- return wordSenses;
- }
-
- /**
- * Extract the different senses. This class returns only the ID of the sense
- * and the gloss. the synsets and other information are omitted.
- *
- * @param xmlLocation
- * : location of the file containing the dictionary instances
- * @param sensemapFile
- * : location of the file containing the equivalent senses in the
- * case of Coarse-grained disambiguation
- * @param wordTag
- * the word to disambiguate. It should be written in the format
- * "word.p" (Exp: "write.v", "well.r", "smart.a", "go.v"
- * @return a {@link HashMap} of word senses with their IDs
- */
- public HashMap<String, String> getDictionaryInstance(String xmlLocation,
- String sensemapFile, String wordTag) {
-
- HashMap<String, ArrayList<DictionaryInstance>> dict = extractWordSenses(
- xmlLocation, sensemapFile, wordTag);
-
- HashMap<String, String> senses = new HashMap<String, String>();
-
- for (String key : dict.keySet()) {
- String sense = dict.get(key).get(0).getGloss();
- senses.put(key, sense);
- }
-
- return senses;
-
- }
-
- /**
- * Extract the training instances from the training/test set File
- *
- * @param xmlDataSet
- * : the file from which the data are to be extracted
- * @return {@link ArrayList} of Word To Disambiguate (WTDIMS) instances
- */
- public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {
-
- ArrayList<WTDIMS> setInstances = new ArrayList<WTDIMS>();
-
- try {
-
- File xmlFile = new File(xmlDataSet);
- DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
- DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
- Document doc = dBuilder.parse(xmlFile);
-
- doc.getDocumentElement().normalize();
-
- NodeList lexelts = doc.getElementsByTagName("lexelt");
-
- for (int i = 0; i < lexelts.getLength(); i++) {
-
- Node nLexelt = lexelts.item(i);
-
- if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
- Element eLexelt = (Element) nLexelt;
-
- NodeList nInstances = nLexelt.getChildNodes();
-
- for (int j = 1; j < nInstances.getLength(); j++) {
-
- Node nInstance = nInstances.item(j);
-
- if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
-
- Element eInstance = (Element) nInstance;
-
- String[] wordPos = eLexelt.getAttribute("item").split("\\.");
- String word = wordPos[0]; // Word
- String tag; // Part of Speech
-
- if (wordPos[1].equals("n")) {
- tag = "noun";
- } else if (wordPos[1].equals("v")) {
- tag = "verb";
- } else if (wordPos[1].equals("a")) {
- tag = "adjective";
- } else {
- tag = "adverb";
- }
-
- String id = eInstance.getAttribute("id");
- String source = eInstance.getAttribute("docsrc");
-
- ArrayList<String> answers = new ArrayList<String>();
- String sentence = "";
- String rawWord = "";
-
- NodeList nChildren = nInstance.getChildNodes();
-
- for (int k = 1; k < nChildren.getLength(); k++) {
- Node nChild = nChildren.item(k);
-
- if (nChild.getNodeName().equals("answer")) {
- // String answer =
- // nChild.getAttributes().item(0).getTextContent();
- String senseid = nChild.getAttributes().item(1)
- .getTextContent();
-
- String temp = senseid;
- // String[] temp = { answer, senseid };
- answers.add(temp);
- }
-
- if (nChild.getNodeName().equals("context")) {
- sentence = ((Element) nChild).getTextContent();
-
- if (nChild.hasChildNodes()) {
- // textbefore =
- // nChild.getChildNodes().item(0).getTextContent();
- rawWord = nChild.getChildNodes().item(1).getTextContent();
- // textAfter =
- // nChild.getChildNodes().item(2).getTextContent();
- }
- }
-
- }
-
- WTDIMS wordToDisambiguate = new WTDIMS(word, answers, sentence,
- rawWord);
- setInstances.add(wordToDisambiguate);
- }
-
- }
-
- }
-
- }
-
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- return setInstances;
-
- }
-
- /**
- * Extract the list of ALL English words
- *
- * @param dict
- * : this file is the same that is used in the simple lemmatizer
- * (i.e.,"en-lemmatizer.dict")
- *
- * @return a list of all the english words
- */
- public HashMap<String, Object> getEnglishWords(String dict) {
-
- HashMap<String, Object> words = new HashMap<String, Object>();
-
- BufferedReader br = null;
-
- File file = new File(englishDict);
-
- if (file.exists()) {
-
- try {
- br = new BufferedReader(new FileReader(file));
- String line = br.readLine();
- while (line != null) {
- line = br.readLine();
- if (line != null) {
- String word = line.split("\\t")[0];
- words.put(word, null);
- }
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- return words;
- }
-
-}
\ No newline at end of file
+// TODO this is to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DictionaryInstance.java
Thu Aug 20 22:01:59 2015
@@ -1,107 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-/**
- * An instance of the dictionary. A dictionary instance has:
- * <ul>
- * <li>index: an index for the current instance of the dictionary</li>
- * <li>word: the word to disambiguate</li>
- * <li>id: its id in the source (e.g., in WordNet, Wordsmyth, etc.)</li>
- * <li>source: the source of the instance (e.g., WordNet, Wordsmyth, etc.)</li>
- * <li>synset: the list of synonyms (i.e., the words that share the same
current
- * meaning)</li>
- * <li>gloss: the sense of the word</li>
- * </ul>
- */
-public class DictionaryInstance {
-
- protected int index;
-
- protected String word;
-
- protected String id;
- protected String source;
- protected String[] synset;
- protected String gloss;
-
- /**
- * Constructor
- */
- public DictionaryInstance(int index, String word, String id, String source,
- String[] synset, String gloss) {
- super();
- this.index = index;
- this.word = word;
- this.id = id;
- this.source = source;
- this.synset = synset;
- this.gloss = gloss;
- }
-
- public int getIndex() {
- return index;
- }
-
- public void setIndex(int index) {
- this.index = index;
- }
-
- public String getWord() {
- return word;
- }
-
- public void setWord(String word) {
- this.word = word;
- }
-
- public String getId() {
- return id;
- }
-
- public void setId(String id) {
- this.id = id;
- }
-
- public String getSource() {
- return source;
- }
-
- public void setSource(String source) {
- this.source = source;
- }
-
- public String[] getSynset() {
- return synset;
- }
-
- public void setSynset(String[] synset) {
- this.synset = synset;
- }
-
- public String getGloss() {
- return gloss;
- }
-
- public void setGloss(String gloss) {
- this.gloss = gloss;
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DistributionInstance.java
Thu Aug 20 22:01:59 2015
@@ -1,73 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-public class DistributionInstance {
-
- protected String word;
- protected String pos;
- protected int trainingSetInstances;
- protected int testSetInstances;
-
- /**
- * Constructor
- */
- public DistributionInstance(String word, String pos,
- int trainingSetInstances, int testSetInstances) {
- super();
- this.word = word;
- this.pos = pos;
- this.trainingSetInstances = trainingSetInstances;
- this.testSetInstances = testSetInstances;
- }
-
- public String getWord() {
- return word;
- }
-
- public void setWord(String word) {
- this.word = word;
- }
-
- public String getPos() {
- return pos;
- }
-
- public void setPos(String pos) {
- this.pos = pos;
- }
-
- public int getTrainingSetInstances() {
- return trainingSetInstances;
- }
-
- public void setTrainingSetInstances(int trainingSetInstances) {
- this.trainingSetInstances = trainingSetInstances;
- }
-
- public int getTestSetInstances() {
- return testSetInstances;
- }
-
- public void setTestSetInstances(int testSetInstances) {
- this.testSetInstances = testSetInstances;
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java
Thu Aug 20 22:01:59 2015
@@ -21,12 +21,13 @@ package opennlp.tools.disambiguator;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashMap;
import opennlp.tools.disambiguator.ims.WTDIMS;
/**
* Class for the extraction of features for the different Supervised
- * Disambiguation apporaches.<br>
+ * Disambiguation approaches.<br>
* Each set of methods refer to one approach
* <ul>
* <li>IMS (It Makes Sense): check {@link https
@@ -52,17 +53,18 @@ public class FeaturesExtractor {
// IMS
- private String[] extractPosOfSurroundingWords(String[] sentence,
- int wordIndex, int windowSize) {
+ private String[] extractPosOfSurroundingWords(WTDIMS wordToDisambiguate,
+ int windowSize) {
- String[] taggedSentence = WSDHelper.getTagger().tag(sentence);
+ String[] taggedSentence = wordToDisambiguate.getPosTags();
String[] tags = new String[2 * windowSize + 1];
int j = 0;
- for (int i = wordIndex - windowSize; i < wordIndex + windowSize; i++) {
- if (i < 0 || i >= sentence.length) {
+ for (int i = wordToDisambiguate.getWordIndex() - windowSize; i <
wordToDisambiguate
+ .getWordIndex() + windowSize; i++) {
+ if (i < 0 || i >= wordToDisambiguate.getSentence().length) {
tags[j] = "null";
} else {
tags[j] = taggedSentence[i].toLowerCase();
@@ -73,33 +75,30 @@ public class FeaturesExtractor {
return tags;
}
- private String[] extractSurroundingWords(String[] sentence, int wordIndex) {
-
- String[] posTags = WSDHelper.getTagger().tag(sentence);
+ private String[] extractSurroundingWords(WTDIMS wordToDisambiguate) {
ArrayList<String> contextWords = new ArrayList<String>();
- for (int i = 0; i < sentence.length; i++) {
-
- if (!WSDHelper.stopWords.contains(sentence[i].toLowerCase())
- && (wordIndex != i)) {
+ for (int i = 0; i < wordToDisambiguate.getSentence().length; i++) {
+ if (wordToDisambiguate.getLemmas() != null) {
+ if (!WSDHelper.stopWords.contains(wordToDisambiguate.getSentence()[i]
+ .toLowerCase()) && (wordToDisambiguate.getWordIndex() != i)) {
+
+ String lemma = wordToDisambiguate.getLemmas()[i].toLowerCase()
+ .replaceAll("[^a-z_]", "").trim();
+
+ if (lemma.length() > 1) {
+ contextWords.add(lemma);
+ }
- String word = sentence[i].toLowerCase().replaceAll("[^a-z]",
"").trim();
-
- // if (!word.equals("") /*&& Constants.isRelevant(posTags[i])*/) {
- if (WSDHelper.getEnglishWords().containsKey(word)) {
- String lemma = WSDHelper.getLemmatizer().lemmatize(word, posTags[i]);
- contextWords.add(lemma);
}
-
}
}
return contextWords.toArray(new String[contextWords.size()]);
}
- private String[] extractLocalCollocations(String[] sentence, int wordIndex,
- int ngram) {
+ private String[] extractLocalCollocations(WTDIMS wordToDisambiguate, int
ngram) {
/**
* Here the author used only 11 features of this type. the range was set to
* 3 (bigrams extracted in a way that they are at max separated by 1 word).
@@ -107,17 +106,22 @@ public class FeaturesExtractor {
ArrayList<String> localCollocations = new ArrayList<String>();
- for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {
+ for (int i = wordToDisambiguate.getWordIndex() - ngram; i <=
wordToDisambiguate
+ .getWordIndex() + ngram; i++) {
- if (!(i < 0 || i > sentence.length - 3)) {
- if ((i != wordIndex) && (i + 1 != wordIndex)
- && (i + 1 < wordIndex + ngram)) {
- String lc = (sentence[i] + " " + sentence[i + 1]).toLowerCase();
+ if (!(i < 0 || i > wordToDisambiguate.getSentence().length - 3)) {
+ if ((i != wordToDisambiguate.getWordIndex())
+ && (i + 1 != wordToDisambiguate.getWordIndex())
+ && (i + 1 < wordToDisambiguate.getWordIndex() + ngram)) {
+ String lc = (wordToDisambiguate.getSentence()[i] + " " +
wordToDisambiguate
+ .getSentence()[i + 1]).toLowerCase();
localCollocations.add(lc);
}
- if ((i != wordIndex) && (i + 2 != wordIndex)
- && (i + 2 < wordIndex + ngram)) {
- String lc = (sentence[i] + " " + sentence[i + 2]).toLowerCase();
+ if ((i != wordToDisambiguate.getWordIndex())
+ && (i + 2 != wordToDisambiguate.getWordIndex())
+ && (i + 2 < wordToDisambiguate.getWordIndex() + ngram)) {
+ String lc = (wordToDisambiguate.getSentence()[i] + " " +
wordToDisambiguate
+ .getSentence()[i + 2]).toLowerCase();
localCollocations.add(lc);
}
}
@@ -141,15 +145,22 @@ public class FeaturesExtractor {
*/
public ArrayList<String> extractTrainingSurroundingWords(
ArrayList<WTDIMS> trainingData) {
-
- ArrayList<String> list = new ArrayList<String>();
-
+
+ HashMap<String, Object> words = new HashMap<String, Object>();
+
for (WTDIMS word : trainingData) {
for (String sWord : word.getSurroundingWords()) {
- list.add(sWord);
+ if (!words.containsKey(sWord.toLowerCase()))
+ words.put(sWord.toLowerCase(), null);
}
}
+ ArrayList<String> list = new ArrayList<String>();
+
+ for (String word : words.keySet()) {
+ list.add(word);
+ }
+
return list;
}
@@ -158,7 +169,7 @@ public class FeaturesExtractor {
* This method generates the different set of features related to the IMS
* approach and store them in the corresponding attributes of the WTDIMS
*
- * @param word
+ * @param wordToDisambiguate
* the word to disambiguate [object: WTDIMS]
* @param windowSize
* the parameter required to generate the features qualified of
@@ -167,14 +178,15 @@ public class FeaturesExtractor {
* the parameter required to generate the features qualified of
* "Local Collocations"
*/
- public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {
+ public void extractIMSFeatures(WTDIMS wordToDisambiguate, int windowSize,
+ int ngram) {
- word.setPosOfSurroundingWords(extractPosOfSurroundingWords(
- word.getSentence(), word.getWordIndex(), windowSize));
- word.setSurroundingWords(extractSurroundingWords(word.getSentence(),
- word.getWordIndex()));
- word.setLocalCollocations(extractLocalCollocations(word.getSentence(),
- word.getWordIndex(), ngram));
+ wordToDisambiguate.setPosOfSurroundingWords(extractPosOfSurroundingWords(
+ wordToDisambiguate, windowSize));
+ wordToDisambiguate
+ .setSurroundingWords(extractSurroundingWords(wordToDisambiguate));
+ wordToDisambiguate.setLocalCollocations(extractLocalCollocations(
+ wordToDisambiguate, ngram));
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
Thu Aug 20 22:01:59 2015
@@ -1,261 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashMap;
-
-import net.sf.extjwnl.JWNLException;
-import net.sf.extjwnl.data.POS;
-import net.sf.extjwnl.dictionary.Dictionary;
-import net.sf.extjwnl.dictionary.MorphologicalProcessor;
-import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.lemmatizer.SimpleLemmatizer;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.util.InvalidFormatException;
-
-public class Loader {
-
- private static SensevalReader dExtractor = new SensevalReader();
-
- private static String modelsDir = "src\\test\\resources\\models\\";
-
- private static SentenceDetectorME sdetector;
- private static Tokenizer tokenizer;
- private static POSTaggerME tagger;
- private static NameFinderME nameFinder;
- private static SimpleLemmatizer lemmatizer;
-
- private static Dictionary dictionary;
- private static MorphologicalProcessor morph;
-
- // local caches for faster lookup
- private static HashMap<String, Object> stemCache;
- private static HashMap<String, Object> stopCache;
- private static HashMap<String, Object> relvCache;
-
- private static HashMap<String, Object> englishWords;
-
- public Loader() {
- super();
- load();
- }
-
- public static HashMap<String, Object> getRelvCache() {
- if (relvCache == null || relvCache.keySet().isEmpty()) {
- relvCache = new HashMap<String, Object>();
- for (String t : Constants.relevantPOS) {
- relvCache.put(t, null);
- }
- }
- return relvCache;
- }
-
- public static HashMap<String, Object> getStopCache() {
- if (stopCache == null || stopCache.keySet().isEmpty()) {
- stopCache = new HashMap<String, Object>();
- for (String s : Constants.stopWords) {
- stopCache.put(s, null);
- }
- }
- return stopCache;
- }
-
- public static HashMap<String, Object> getStemCache() {
- if (stemCache == null || stemCache.keySet().isEmpty()) {
- stemCache = new HashMap<String, Object>();
- for (Object pos : POS.getAllPOS()) {
- stemCache.put(((POS) pos).getKey(), new HashMap());
- }
- }
- return stemCache;
- }
-
- public static HashMap<String, Object> getEnglishWords() {
- if (englishWords == null || englishWords.keySet().isEmpty()) {
- englishWords = Constants
- .getEnglishWords(modelsDir + "en-lemmatizer.dict");
- }
- return englishWords;
- }
-
- public static MorphologicalProcessor getMorph() {
- if (morph == null) {
- getDictionary();
- morph = dictionary.getMorphologicalProcessor();
- }
- return morph;
- }
-
- public static Dictionary getDictionary() {
- if (dictionary == null) {
- try {
- dictionary = Dictionary.getDefaultResourceInstance();
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
- return dictionary;
- }
-
- public static SimpleLemmatizer getLemmatizer() {
- if (lemmatizer == null) {
- try {
- lemmatizer = new SimpleLemmatizer(new FileInputStream(modelsDir
- + "en-lemmatizer.dict"));
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- return lemmatizer;
- }
-
- public static NameFinderME getNameFinder() {
- if (nameFinder == null) {
- TokenNameFinderModel nameFinderModel;
- try {
- nameFinderModel = new TokenNameFinderModel(new FileInputStream(
- modelsDir + "en-ner-person.bin"));
- nameFinder = new NameFinderME(nameFinderModel);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return nameFinder;
- }
-
- public static POSTaggerME getTagger() {
- if (tagger == null) {
- POSModel posTaggerModel = new POSModelLoader().load(new File(modelsDir
- + "en-pos-maxent.bin"));
- tagger = new POSTaggerME(posTaggerModel);
- }
- return tagger;
- }
-
- public static SentenceDetectorME getSDetector() {
- if (sdetector == null) {
- try {
- SentenceModel enSentModel = new SentenceModel(new FileInputStream(
- modelsDir + "en-sent.bin"));
- sdetector = new SentenceDetectorME(enSentModel);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return sdetector;
- }
-
- public static Tokenizer getTokenizer() {
- if (tokenizer == null) {
- try {
- TokenizerModel tokenizerModel = new TokenizerModel(new FileInputStream(
- modelsDir + "en-token.bin"));
- tokenizer = new TokenizerME(tokenizerModel);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- }
- return tokenizer;
- }
-
- public static boolean isInitialized() {
- return (dictionary != null && morph != null && stemCache != null
- && stopCache != null && relvCache != null);
- }
-
- public void load() {
- try {
- SentenceModel enSentModel = new SentenceModel(new FileInputStream(
- modelsDir + "en-sent.bin"));
- sdetector = new SentenceDetectorME(enSentModel);
-
- TokenizerModel TokenizerModel = new TokenizerModel(new FileInputStream(
- modelsDir + "en-token.bin"));
- tokenizer = new TokenizerME(TokenizerModel);
-
- POSModel posTaggerModel = new POSModelLoader().load(new File(modelsDir
- + "en-pos-maxent.bin"));
- tagger = new POSTaggerME(posTaggerModel);
-
- TokenNameFinderModel nameFinderModel = new TokenNameFinderModel(
- new FileInputStream(modelsDir + "en-ner-person.bin"));
- nameFinder = new NameFinderME(nameFinderModel);
-
- lemmatizer = new SimpleLemmatizer(new FileInputStream(modelsDir
- + "en-lemmatizer.dict"));
-
- dictionary = Dictionary.getDefaultResourceInstance();
- morph = dictionary.getMorphologicalProcessor();
-
- // loading lookup caches
- stemCache = new HashMap();
- for (Object pos : POS.getAllPOS()) {
- stemCache.put(((POS) pos).getKey(), new HashMap());
- }
-
- stopCache = new HashMap<String, Object>();
- for (String s : Constants.stopWords) {
- stopCache.put(s, null);
- }
-
- relvCache = new HashMap<String, Object>();
- for (String t : Constants.relevantPOS) {
- relvCache.put(t, null);
- }
-
- englishWords = new HashMap<String, Object>();
-
- if (isInitialized()) {
- Constants.print("loading was succesfull");
- } else {
- Constants.print("loading was unsuccesfull");
- }
-
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (InvalidFormatException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
-
- public static void unload() {
- dictionary.close();
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/PreProcessor.java
Thu Aug 20 22:01:59 2015
@@ -1,195 +1 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-import net.sf.extjwnl.JWNLException;
-import net.sf.extjwnl.data.POS;
-import opennlp.tools.util.Span;
-
-public class PreProcessor {
-
- public PreProcessor() {
- super();
- }
-
- public static String[] split(String text) {
- return Loader.getSDetector().sentDetect(text);
- }
-
- public static String[] tokenize(String sentence) {
- return Loader.getTokenizer().tokenize(sentence);
- }
-
- public static String[] tag(String[] tokenizedSentence) {
- return Loader.getTagger().tag(tokenizedSentence);
- }
-
- public static String lemmatize(String word, String posTag) {
- return Loader.getLemmatizer().lemmatize(word, posTag);
- }
-
- public static boolean isName(String word) {
- Span nameSpans[] = Loader.getNameFinder().find(new String[] { word });
- return (nameSpans.length != 0);
- }
-
- public static ArrayList<WordPOS> getAllRelevantWords(String[] sentence) {
-
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
-
- String[] tags = tag(sentence);
-
- for (int i = 0; i < sentence.length; i++) {
- if (!Loader.getStopCache().containsKey(sentence[i])) {
- if (Loader.getRelvCache().containsKey(tags[i])) {
- relevantWords
- .add(new WordPOS(sentence[i],tags[i]));
- }
-
- }
- }
- return relevantWords;
- }
-
- public static ArrayList<WordPOS> getAllRelevantWords(WordToDisambiguate
word) {
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
-
- String[] tags = tag(word.getSentence());
-
- for (int i = 0; i < word.getSentence().length; i++) {
- if (!Loader.getStopCache().containsKey(word.getSentence()[i])) {
- if (Loader.getRelvCache().containsKey(tags[i])) {
- WordPOS wordpos = new WordPOS(word.getSentence()[i],tags[i]);
- if(i == word.getWordIndex()){
- wordpos.isTarget = true;
- }
- relevantWords
- .add(wordpos);
- }
-
- }
- }
- return relevantWords;
- }
-
- public static ArrayList<WordPOS> getRelevantWords(WordToDisambiguate word,
- int winBackward, int winForward) {
-
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
-
- String[] sentence = word.getSentence();
- String[] tags = tag(sentence);
-
- int index = word.getWordIndex();
-
- for (int i = index - winBackward; i <= index + winForward; i++) {
-
- if (i >= 0 && i < sentence.length && i != index) {
- if (!Loader.getStopCache().containsKey(sentence[i])) {
-
- if (Loader.getRelvCache().containsKey(tags[i])) {
- relevantWords.add(new WordPOS(sentence[i], tags[i]));
- }
-
- }
- }
- }
- return relevantWords;
- }
-
- /**
- * Stem a single word with WordNet dictionnary
- *
- * @param wordToStem
- * word to be stemmed
- * @return stemmed list of words
- */
- public static List StemWordWithWordNet(WordPOS wordToStem) {
- if (wordToStem == null)
- return null;
- ArrayList<String> stems = new ArrayList();
- try {
- for (Object pos : POS.getAllPOS()) {
- stems.addAll(Loader.getMorph().lookupAllBaseForms((POS) pos,
- wordToStem.getWord()));
- }
-
- if (stems.size() > 0)
- return stems;
- else {
- return null;
- }
-
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- return null;
- }
-
- /**
- * Stem a single word tries to look up the word in the stemCache HashMap If
- * the word is not found it is stemmed with WordNet and put into stemCache
- *
- * @param wordToStem
- * word to be stemmed
- * @return stemmed word list, null means the word is incorrect
- */
- public static List Stem(WordPOS wordToStem) {
-
- // check if we already cached the stem map
- HashMap posMap = (HashMap) Loader.getStemCache().get(
- wordToStem.getPOS().getKey());
-
- // don't check words with digits in them
- if (containsNumbers(wordToStem.getWord())) {
- return null;
- }
-
- List stemList = (List) posMap.get(wordToStem.getWord());
- if (stemList != null) { // return it if we already cached it
- return stemList;
-
- } else { // unCached list try to stem it
- stemList = StemWordWithWordNet(wordToStem);
- if (stemList != null) {
- // word was recognized and stemmed with wordnet:
- // add it to cache and return the stemmed list
- posMap.put(wordToStem.getWord(), stemList);
- Loader.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
- return stemList;
- } else { // could not be stemmed add it anyway (as incorrect with null
- // list)
- posMap.put(wordToStem.getWord(), null);
- Loader.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
- return null;
- }
- }
- }
-
- public static boolean containsNumbers(String word) {
- // checks if the word is or contains a number
- return word.matches(".*[0-9].*");
- }
-
-}
+// TODO to be removed
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java
Thu Aug 20 22:01:59 2015
@@ -60,8 +60,8 @@ public class WSDEvaluator extends Evalua
// get the best predicted sense
String predictedSense = disambiguator.disambiguate(reference.getSentence(),
- reference.getTags(), reference.getTargetPosition(),
- reference.getTargetLemma())[0];
+ reference.getTags(),
+ reference.getLemmas(), reference.getTargetPosition())[0];
if (predictedSense == null) {
System.out.println("There was no sense for : " +
reference.getTargetWord());
@@ -94,8 +94,8 @@ public class WSDEvaluator extends Evalua
}
}
- return new WSDSample(reference.getSentence(), reference.getTags(),
- reference.getTargetPosition(), reference.getTargetLemma());
+ return new WSDSample(reference.getSentence(), reference.getTags(),
reference.getLemmas(),
+ reference.getTargetPosition());
}
/**
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
Thu Aug 20 22:01:59 2015
@@ -26,7 +26,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.List;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
@@ -57,6 +56,7 @@ public class WSDHelper {
private static HashMap<String, Object> relvCache;
private static HashMap<String, Object> englishWords;
+ private static HashMap<String, Object> nonRelevWordsDef;
// List of all the PoS tags
public static String[] allPOS = { "CC", "CD", "DT", "EX", "FW", "IN", "JJ",
@@ -191,6 +191,59 @@ public class WSDHelper {
return englishWords;
}
+ /**
+ * This initializes the Hashmap of non relevant words definitions, and
returns
+ * the definition of the non relevant word based on its pos-tag
+ *
+ * @param posTag
+ * the pos-tag of the non relevant word
+ * @return the definition of the word
+ */
+ public static String getNonRelevWordsDef(String posTag) {
+ if (nonRelevWordsDef == null || nonRelevWordsDef.keySet().isEmpty()) {
+ nonRelevWordsDef = new HashMap<String, Object>();
+
+ nonRelevWordsDef.put("CC", "coordinating conjunction");
+ nonRelevWordsDef.put("CD", "cardinal number");
+ nonRelevWordsDef.put("DT", "determiner");
+ nonRelevWordsDef.put("EX", "existential there");
+ nonRelevWordsDef.put("FW", "foreign word");
+ nonRelevWordsDef.put("IN", "preposition / subordinating conjunction");
+ nonRelevWordsDef.put("JJ", "adjective");
+ nonRelevWordsDef.put("JJR", "adjective, comparative");
+ nonRelevWordsDef.put("JJS", "adjective, superlative");
+ nonRelevWordsDef.put("LS", "list marker");
+ nonRelevWordsDef.put("MD", "modal");
+ nonRelevWordsDef.put("NN", "noun, singular or mass");
+ nonRelevWordsDef.put("NNS", "noun plural");
+ nonRelevWordsDef.put("NNP", "proper noun, singular");
+ nonRelevWordsDef.put("NNPS", "proper noun, plural");
+ nonRelevWordsDef.put("PDT", "predeterminer");
+ nonRelevWordsDef.put("POS", "possessive ending");
+ nonRelevWordsDef.put("PRP", "personal pronoun");
+ nonRelevWordsDef.put("PRP$", "possessive pronoun");
+ nonRelevWordsDef.put("RB", "adverb");
+ nonRelevWordsDef.put("RBR", "adverb, comparative");
+ nonRelevWordsDef.put("RBS", "adverb, superlative");
+ nonRelevWordsDef.put("RP", "particle");
+ nonRelevWordsDef.put("SYM", "Symbol");
+ nonRelevWordsDef.put("TO", "to");
+ nonRelevWordsDef.put("UH", "interjection");
+ nonRelevWordsDef.put("VB", "verb, base form");
+ nonRelevWordsDef.put("VBD", "verb, past tense");
+ nonRelevWordsDef.put("VBG", "verb, gerund/present participle");
+ nonRelevWordsDef.put("VBN", "verb, past participle");
+ nonRelevWordsDef.put("VBP", "verb, sing. present, non-3d");
+ nonRelevWordsDef.put("VBZ", "verb, 3rd person sing. present");
+ nonRelevWordsDef.put("WDT", "wh-determiner");
+ nonRelevWordsDef.put("WP", "wh-pronoun");
+ nonRelevWordsDef.put("WP$", "possessive wh-pronoun");
+ nonRelevWordsDef.put("WRB", "wh-adverb");
+
+ }
+ return (String) nonRelevWordsDef.get(posTag);
+ }
+
public static MorphologicalProcessor getMorph() {
if (morph == null) {
getDictionary();
@@ -281,35 +334,63 @@ public class WSDHelper {
for (int i = 0; i < results.length; i++) {
parts = results[i].split(" ");
sensekey = parts[1];
- score = Double.parseDouble(parts[2]);
- try {
- print("score : "
- + score
- + " for sense "
- + i
- + " : "
- + sensekey
- + " : "
- + getDictionary().getWordBySenseKey(sensekey).getSynset()
- .getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
+ if (parts.length != 3) {
+ score = -1.0;
+ } else {
+ score = Double.parseDouble(parts[2]);
+ }
+ if (parts[0].equalsIgnoreCase(WSDParameters.SenseSource.WORDNET
+ .name())) {
+
+ try {
+ print("score : "
+ + score
+ + " for sense "
+ + i
+ + " : "
+ + sensekey
+ + " : "
+ + getDictionary().getWordBySenseKey(sensekey).getSynset()
+ .getGloss());
+
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ } else {
+ if (parts[0].equalsIgnoreCase(WSDParameters.SenseSource.WSDHELPER
+ .name())) {
+
+ print("This word is a " + sensekey + " : "
+ + WSDHelper.getNonRelevWordsDef(sensekey));
+
+ }
}
}
} else {
for (int i = 0; i < results.length; i++) {
parts = results[i].split(" ");
sensekey = parts[1];
- try {
- print("sense "
- + i
- + " : "
- + sensekey
- + " : "
- + getDictionary().getWordBySenseKey(sensekey).getSynset()
- .getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
+
+ if (parts[0].equalsIgnoreCase(WSDParameters.SenseSource.WORDNET
+ .name())) {
+
+ try {
+ print("sense "
+ + i
+ + " : "
+ + sensekey
+ + " : "
+ + getDictionary().getWordBySenseKey(sensekey).getSynset()
+ .getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ } else if (parts[0]
+ .equalsIgnoreCase(WSDParameters.SenseSource.WSDHELPER.name())) {
+
+ print("This word is a " + sensekey + " : "
+ + WSDHelper.getNonRelevWordsDef(sensekey));
+
}
}
}
@@ -443,7 +524,7 @@ public class WSDHelper {
* @return whether a PoS Tag corresponds to a relevant Part of Speech (type
* {@link POS}) or not ( true} if it is, false} otherwise)
*/
- public static boolean isRelevant(String posTag) {
+ public static boolean isRelevantPOSTag(String posTag) {
return getPOS(posTag) != null;
}
@@ -461,7 +542,7 @@ public class WSDHelper {
* The Part of Speech of Type {@link POS}
* @return whether a Part of Speech is relevant (true) or not (false)
*/
- public static boolean isRelevant(POS pos) {
+ public static boolean isRelevantPOS(POS pos) {
return pos.equals(POS.ADJECTIVE) || pos.equals(POS.ADVERB)
|| pos.equals(POS.NOUN) || pos.equals(POS.VERB);
}
@@ -547,51 +628,6 @@ public class WSDHelper {
return relevantWords;
}
- public static ArrayList<WordPOS> getAllRelevantWords(WordToDisambiguate
word) {
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
-
- String[] tags = WSDHelper.getTagger().tag(word.getSentence());
-
- for (int i = 0; i < word.getSentence().length; i++) {
- if (!WSDHelper.getStopCache().containsKey(word.getSentence()[i])) {
- if (WSDHelper.getRelvCache().containsKey(tags[i])) {
- WordPOS wordpos = new WordPOS(word.getSentence()[i], tags[i]);
- if (i == word.getWordIndex()) {
- wordpos.isTarget = true;
- }
- relevantWords.add(wordpos);
- }
-
- }
- }
- return relevantWords;
- }
-
- public static ArrayList<WordPOS> getRelevantWords(WordToDisambiguate word,
- int winBackward, int winForward) {
-
- ArrayList<WordPOS> relevantWords = new ArrayList<WordPOS>();
-
- String[] sentence = word.getSentence();
- String[] tags = WSDHelper.getTagger().tag(sentence);
-
- int index = word.getWordIndex();
-
- for (int i = index - winBackward; i <= index + winForward; i++) {
-
- if (i >= 0 && i < sentence.length && i != index) {
- if (!WSDHelper.getStopCache().containsKey(sentence[i])) {
-
- if (WSDHelper.getRelvCache().containsKey(tags[i])) {
- relevantWords.add(new WordPOS(sentence[i], tags[i]));
- }
-
- }
- }
- }
- return relevantWords;
- }
-
/**
* Stem a single word with WordNet dictionnary
*
@@ -630,7 +666,9 @@ public class WSDHelper {
* @return stemmed word list, null means the word is incorrect
*/
public static ArrayList<String> Stem(WordPOS wordToStem) {
-
+ if (wordToStem.getPOS() == null) {
+ WSDHelper.print("the word is " + wordToStem.getWord());
+ }
// check if we already cached the stem map
HashMap posMap = (HashMap) WSDHelper.getStemCache().get(
wordToStem.getPOS().getKey());
@@ -653,9 +691,10 @@ public class WSDHelper {
posMap.put(wordToStem.getWord(), stemList);
WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
return stemList;
- } else { // could not be stemmed add it anyway (as incorrect with null
- // list)
- posMap.put(wordToStem.getWord(), null);
+ } else { // could not be stemmed; add it anyway (as it is)
+ stemList = new ArrayList<String>();
+ stemList.add(wordToStem.getWord());
+ posMap.put(wordToStem.getWord(), stemList);
WSDHelper.getStemCache().put(wordToStem.getPOS().getKey(), posMap);
return null;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
Thu Aug 20 22:01:59 2015
@@ -28,10 +28,17 @@ public abstract class WSDParameters {
protected boolean isCoarseSense;
public static boolean isStemCompare;
- public static enum Source {
- WORDNET
+ public static enum TrainingSource {
+ SEMCOR, SEMEVAL, OTHER
}
+ public static enum SenseSource {
+ WORDNET, WSDHELPER, OTHER;
+ }
+
+ protected SenseSource senseSource;
+ protected TrainingSource trainingSource;
+
/**
* @return if the disambiguation type is coarse grained or fine grained
*/
@@ -43,6 +50,30 @@ public abstract class WSDParameters {
this.isCoarseSense = isCoarseSense;
}
+ public static boolean isStemCompare() {
+ return isStemCompare;
+ }
+
+ public static void setStemCompare(boolean isStemCompare) {
+ WSDParameters.isStemCompare = isStemCompare;
+ }
+
+ public SenseSource getSenseSource() {
+ return senseSource;
+ }
+
+ public void setSenseSource(SenseSource senseSource) {
+ this.senseSource = senseSource;
+ }
+
+ public TrainingSource getTrainingSource() {
+ return trainingSource;
+ }
+
+ public void setTrainingSource(TrainingSource trainingSource) {
+ this.trainingSource = trainingSource;
+ }
+
public WSDParameters() {
this.isCoarseSense = true;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java
Thu Aug 20 22:01:59 2015
@@ -32,69 +32,51 @@ public class WSDSample {
private List<String> sentence;
private List<String> tags;
+ private List<String> lemmas;
private int senseID;
private List<String> senseIDs;
private int targetPosition;
- private String targetLemma;
- public WSDSample(String sentence[], String tags[], int targetPosition,
- String targetLemma, int senseID) {
+ public WSDSample(String sentence[], String tags[], String[] lemmas,
+ int targetPosition, int senseID) {
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
.asList(sentence)));
this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
.asList(tags)));
this.targetPosition = targetPosition;
- this.targetLemma = targetLemma;
+ this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
+ .asList(lemmas)));
+ ;
this.senseID = senseID;
checkArguments();
}
- public WSDSample(String sentence[], String tags[], int targetPosition,
- String targetLemma, String senseIDs[]) {
+ public WSDSample(String sentence[], String tags[], String[] lemmas,
+ int targetPosition) {
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
.asList(sentence)));
this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
.asList(tags)));
this.targetPosition = targetPosition;
- this.targetLemma = targetLemma;
- this.senseIDs = Collections.unmodifiableList(new ArrayList<String>(Arrays
- .asList(senseIDs)));
+ this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
+ .asList(lemmas)));
;
checkArguments();
}
-
- public WSDSample(List<String> sentence, List<String> tags,
- int targetPosition, String targetLemma, int senseID) {
- this.sentence = Collections
- .unmodifiableList(new ArrayList<String>(sentence));
- this.tags = Collections.unmodifiableList(new ArrayList<String>(tags));
- this.targetPosition = targetPosition;
- this.targetLemma = targetLemma;
- this.senseID = senseID;
- checkArguments();
- }
-
- public WSDSample(List<String> sentence, List<String> tags,
- int targetPosition, String targetLemma, List<String> senseIDs) {
- this.sentence = Collections
- .unmodifiableList(new ArrayList<String>(sentence));
- this.tags = Collections.unmodifiableList(new ArrayList<String>(tags));
+
+ public WSDSample(String sentence[], String tags[], String[] lemmas,
+ int targetPosition, List<String> senseIDs) {
+ this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
+ .asList(sentence)));
+ this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
+ .asList(tags)));
this.targetPosition = targetPosition;
- this.targetLemma = targetLemma;
+ this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
+ .asList(lemmas)));
this.senseIDs = senseIDs;
checkArguments();
}
- public WSDSample(String sentence[], String tags[], int targetPosition,
- String targetLemma) {
- this(sentence, tags, targetPosition, targetLemma, -1);
- }
-
- public WSDSample(List<String> sentence, List<String> tags,
- int targetPosition, String targetLemma) {
- this(sentence, tags, targetPosition, targetLemma, -1);
- }
-
private void checkArguments() {
if (sentence.size() != tags.size() || targetPosition < 0
|| targetPosition >= tags.size())
@@ -113,6 +95,10 @@ public class WSDSample {
return tags.toArray(new String[tags.size()]);
}
+ public String[] getLemmas() {
+ return lemmas.toArray(new String[lemmas.size()]);
+ }
+
public int getTargetPosition() {
return targetPosition;
}
@@ -133,10 +119,6 @@ public class WSDSample {
return tags.get(targetPosition);
}
- public String getTargetLemma() {
- return targetLemma;
- }
-
public void setSentence(List<String> sentence) {
this.sentence = sentence;
}
@@ -145,6 +127,10 @@ public class WSDSample {
this.tags = tags;
}
+ public void setLemmas(List<String> lemmas) {
+ this.lemmas = lemmas;
+ }
+
public void setSenseID(int senseID) {
this.senseID = senseID;
}
@@ -157,10 +143,6 @@ public class WSDSample {
this.targetPosition = targetPosition;
}
- public void setTargetLemma(String targetLemma) {
- this.targetLemma = targetLemma;
- }
-
@Override
public String toString() {
@@ -192,11 +174,11 @@ public class WSDSample {
String tokenTags[] = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);
int position = Integer.parseInt(tokenTags[0]);
- String lemma = tokenTags[1];
- String sentence[] = new String[tokenTags.length - 2];
- String tags[] = new String[tokenTags.length - 2];
+ String sentence[] = new String[tokenTags.length - 1];
+ String tags[] = new String[tokenTags.length - 1];
+ String lemmas[] = new String[tokenTags.length - 1];
- for (int i = 2; i < tokenTags.length; i++) {
+ for (int i = 1; i < tokenTags.length; i++) {
int split = tokenTags[i].lastIndexOf("_");
if (split == -1) {
@@ -205,9 +187,10 @@ public class WSDSample {
sentence[i] = tokenTags[i].substring(0, split);
tags[i] = tokenTags[i].substring(split + 1);
+ lemmas[i] = tokenTags[i].substring(split + 2);
}
- return new WSDSample(sentence, tags, position, lemma);
+ return new WSDSample(sentence, tags, lemmas, position);
}
@Override
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
Thu Aug 20 22:01:59 2015
@@ -20,7 +20,10 @@
package opennlp.tools.disambiguator;
import java.security.InvalidParameterException;
+import java.util.ArrayList;
+import java.util.List;
+import opennlp.tools.disambiguator.ims.IMSParameters;
import opennlp.tools.util.Span;
/**
@@ -45,46 +48,114 @@ import opennlp.tools.util.Span;
* @see Lesk
* @see IMS
*/
-public interface WSDisambiguator {
+public abstract class WSDisambiguator {
/**
* @return the parameters of the disambiguation algorithm
*/
- public WSDParameters getParams();
+ public abstract WSDParameters getParams();
/**
* @param params
*          the disambiguation implementation specific parameters.
* @throws InvalidParameterException
*/
- public void setParams(WSDParameters params) throws InvalidParameterException;
+ public abstract void setParams(WSDParameters params) throws
InvalidParameterException;
/**
* @param tokenizedContext
* @param tokenTags
+ * @param lemmas
* @param ambiguousTokenIndex
- * @param ambiguousTokenLemma
* @return result as an array of WordNet IDs
*/
- public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
- int ambiguousTokenIndex, String ambiguousTokenLemma);
+ public abstract String[] disambiguate(String[] tokenizedContext, String[]
tokenTags, String[] lemmas,
+ int ambiguousTokenIndex);
- /**
+ /** The disambiguation method for all the words in a Span
* @param tokenizedContext
* @param tokenTags
* @param ambiguousTokenIndexSpan
* @param ambiguousTokenLemma
* @return result as an array of WordNet IDs
*/
- public String[][] disambiguate(String[] tokenizedContext, String[] tokenTags,
- Span ambiguousTokenIndexSpan, String ambiguousTokenLemma);
+ public List<String[]> disambiguate(String[] tokenizedContext, String[]
tokenTags, String[] lemmas,
+ Span ambiguousTokenIndexSpan){
+ List<String[]> senses = new ArrayList<String[]>();
+
+ int start = Math.max(0, ambiguousTokenIndexSpan.getStart());
+
+ int end = Math.max(start,Math.min(tokenizedContext.length,
ambiguousTokenIndexSpan.getEnd()));
+
+
+ for (int i = start; i < end + 1; i++) {
+
+ if (WSDHelper.isRelevantPOSTag(tokenTags[i])) {
+ WSDSample sample = new WSDSample(tokenizedContext, tokenTags, lemmas,
i);
+ String[] sense = disambiguate(sample);
+ senses.add(sense);
+ } else {
+
+ if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
+ String s = WSDParameters.SenseSource.WSDHELPER.name() + " "
+ + WSDHelper.getNonRelevWordsDef(tokenTags[i]);
+ String[] sense = { s };
+
+ senses.add(sense);
+ } else {
+ senses.add(null);
+ }
+ }
+
+ }
+
+ return senses;
+ }
+
+ /**
+ * The disambiguation method for all the words of the context
+ *
+ * @param tokenizedContext
+ * : the text containing the word to disambiguate
+ * @param tokenTags
+ * : the tags corresponding to the context
+ * @param lemmas
+ * : the lemmas of ALL the words in the context
+ * @return a List of arrays, each corresponding to the senses of each word of
+ * the context which are to be disambiguated
+ */
+ public List<String[]> disambiguate(String[] tokenizedContext,
+ String[] tokenTags, String[] lemmas) {
+
+ List<String[]> senses = new ArrayList<String[]>();
+
+ for (int i = 0; i < tokenizedContext.length; i++) {
+
+ if (WSDHelper.isRelevantPOSTag(tokenTags[i])) {
+ WSDSample sample = new WSDSample(tokenizedContext, tokenTags, lemmas,
i);
+ String[] sense = disambiguate(sample);
+ senses.add(sense);
+ } else {
+
+ if (WSDHelper.getNonRelevWordsDef(tokenTags[i]) != null) {
+ String s = IMSParameters.SenseSource.WSDHELPER.name() + " " +
tokenTags[i];
+ String[] sense = { s };
+
+ senses.add(sense);
+ } else {
+ senses.add(null);
+ }
+ }
+
+ }
+
+ return senses;
+ }
/**
* @param sample the WSDSample containing the word to disambiguate
* @return result as an array of WordNet IDs
*/
- public String[] disambiguate(WSDSample sample);
+ public abstract String[] disambiguate(WSDSample sample);
- @Deprecated
- public String[] disambiguate(String[] inputText, int inputWordIndex);
}
\ No newline at end of file
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java?rev=1696865&r1=1696864&r2=1696865&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WordPOS.java
Thu Aug 20 22:01:59 2015
@@ -33,10 +33,9 @@ import net.sf.extjwnl.data.Synset;
public class WordPOS {
private String word;
- private List stems;
+ private List<String> stems;
private POS pos;
private String posTag;
- private int wordIndex;
public boolean isTarget = false;
public WordPOS(String word, String tag) throws IllegalArgumentException {
@@ -68,7 +67,7 @@ public class WordPOS {
return posTag;
}
- public List getStems() {
+ public List<String> getStems() {
if (stems == null) {
return WSDHelper.Stem(this);
} else {
@@ -99,8 +98,8 @@ public class WordPOS {
// uses Stemming to check if two words are equivalent
public boolean isStemEquivalent(WordPOS wordToCompare) {
// check if there is intersection in the stems;
- List originalList = this.getStems();
- List listToCompare = wordToCompare.getStems();
+ List<String> originalList = this.getStems();
+ List<String> listToCompare = wordToCompare.getStems();
if (originalList == null || listToCompare == null) {
return false;
@@ -118,16 +117,4 @@ public class WordPOS {
}
- // uses Lemma to check if two words are equivalent
- public boolean isLemmaEquivalent(WordPOS wordToCompare) {
-
- ArrayList<String> lemmas_word = new ArrayList();
- ArrayList<String> lemmas_wordToCompare = new ArrayList();
-
- for (String pos : WSDHelper.allPOS) {
- WSDHelper.getLemmatizer().lemmatize(wordToCompare.getWord(), pos);
- }
- return false;
- }
-
}