Author: joern
Date: Wed Aug 26 15:56:53 2015
New Revision: 1697959

URL: http://svn.apache.org/r1697959
Log:
OPENNLP-791 WordNet based clusters patch, uses ME for now will have to modify 
for other classifiers. Thanks to Anthony Beylerian for providing a patch!

Added:
    opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
   (with props)
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
   (with props)
Modified:
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
    
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
    
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java?rev=1697959&r1=1697958&r2=1697959&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
 Wed Aug 26 15:56:53 2015
@@ -62,7 +62,7 @@ public abstract class WSDParameters {
   }
 
   public WSDParameters() {
-    this.isCoarseSense = true;
+    this.isCoarseSense = false;
   }
 
   /**

Modified: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java?rev=1697959&r1=1697958&r2=1697959&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
 Wed Aug 26 15:56:53 2015
@@ -75,8 +75,11 @@ public abstract class WSDisambiguator {
    * @param ambiguousTokenIndex
    * @return result as an array of WordNet IDs
    */
-  public abstract String[] disambiguate(String[] tokenizedContext,
-      String[] tokenTags, String[] lemmas, int ambiguousTokenIndex);
+  public String[] disambiguate(String[] tokenizedContext,
+      String[] tokenTags, String[] lemmas, int ambiguousTokenIndex){
+         return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+                       ambiguousTokenIndex));
+  }
 
   /**
    * The disambiguation method for all the words in a Span

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import net.sf.extjwnl.data.Synset;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.WordPOS;
+
+/**
+ * The default Context Generator of OSCC
+ */
+public class DefaultOSCCContextGenerator implements OSCCContextGenerator {
+
+  public DefaultOSCCContextGenerator() {
+  }
+
+  public String[] extractSurroundingContextClusters(int index, String[] toks,
+      String[] tags, String[] lemmas, int windowSize) {
+
+    ArrayList<String> contextClusters = new ArrayList<String>();
+
+    for (int i = 0; i < toks.length; i++) {
+      if (lemmas != null) {
+
+        if (!WSDHelper.stopWords.contains(toks[i].toLowerCase())
+            && (index != i)) {
+
+          String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
+              .trim();
+          
+          WordPOS word = new WordPOS(lemma, tags[i]);
+
+          // TODO check fix for "_" and null pointers
+          if (lemma.length() > 1 && !lemma.contains("_")) {
+            try{
+            ArrayList<Synset> synsets = word.getSynsets();
+            if (synsets!=null && synsets.size() > 0 ){
+              contextClusters.add(synsets.get(0).getOffset() + "");
+            }
+            }catch(NullPointerException ex)
+            {
+              //TODO tagger mistake add proper exception
+            }
+          }
+
+        }
+      }
+    }
+
+    return contextClusters.toArray(new String[contextClusters.size()]);
+
+  }
+
+  /**
+   * Get Context of a word To disambiguate
+   * 
+   * @return The OSCC context of the word to disambiguate
+   */
+  @Override
+  public String[] getContext(int index, String[] toks, String[] tags,
+      String[] lemmas, int windowSize) {
+
+    HashSet<String> surroundingContextClusters = new HashSet<>();
+    surroundingContextClusters.addAll(Arrays
+        .asList(extractSurroundingContextClusters(index, toks, tags, lemmas,
+            windowSize)));
+
+    String[] serializedFeatures = new 
String[surroundingContextClusters.size()];
+
+    int i = 0;
+
+    for (String feature : surroundingContextClusters) {
+      serializedFeatures[i] = "F" + i + "=" + feature;
+      i++;
+    }
+
+    return serializedFeatures;
+
+  }
+
+  public String[] getContext(WSDSample sample, int windowSize) {
+
+    return getContext(sample.getTargetPosition(), sample.getSentence(),
+        sample.getTags(), sample.getLemmas(), windowSize);
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import opennlp.tools.disambiguator.WSDSample;
+
+/**
+ * Interface for {@link OSCCME} context generators.
+ */
+public interface OSCCContextGenerator {
+
+  String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
+    int windowSize);
+
+  String[] getContext(WSDSample sample, int windowSize);
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+public class OSCCFactory extends BaseToolFactory {
+
+  /**
+   * Creates a {@link OSCCFactory} that provides the default implementation of
+   * the resources.
+   * */
+  public OSCCFactory() {
+
+  }
+
+  public static OSCCFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new OSCCFactory();
+    }
+    try {
+      OSCCFactory theFactory = ExtensionLoader.instantiateExtension(
+          OSCCFactory.class, subclassName);
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      System.err.println(msg);
+      e.printStackTrace();
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // no additional artifacts
+  }
+
+  public OSCCContextGenerator getContextGenerator() {
+    return new DefaultOSCCContextGenerator();
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.mfs.MFS;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.TrainingParameters;
+
+public class OSCCME extends WSDisambiguator {
+
+  protected OSCCModel osccModel;
+
+  protected static OSCCContextGenerator cg = new DefaultOSCCContextGenerator();
+
+  public OSCCME(OSCCParameters params) {
+    this.params = params;
+  }
+
+  /**
+   * Creates a disambiguator backed by an already loaded model.
+   *
+   * @param model
+   *          the model to classify with
+   * @param params
+   *          the parameters; their window size is asserted to equal the
+   *          window size of the model
+   */
+  public OSCCME(OSCCModel model, OSCCParameters params) {
+    // store the argument: "this.osccModel = osccModel" assigned the field to
+    // itself and left it null
+    this.osccModel = model;
+    this.params = params;
+
+    Assert.assertEquals(model.getWindowSize(), params.getWindowSize());
+  }
+
+  public void setModel(OSCCModel model) {
+    this.osccModel = model;
+  }
+
+  public void setParameters(OSCCParameters parameters) {
+    this.params = parameters;
+  }
+
+  /**
+   * Trains an OSCC model from the given samples.
+   *
+   * @param lang
+   *          the language code passed on to the model
+   * @param samples
+   *          the training samples; the word tag of the first sample becomes
+   *          the word tag of the model
+   * @param mlParams
+   *          the settings handed to the {@link TrainerFactory}
+   * @param osccParams
+   *          the OSCC parameters providing the window size
+   * @param imsfactory
+   *          the factory passed on to the model
+   * @return the trained model
+   * @throws IOException
+   *           propagated from reading the samples or from the trainer
+   * @throws IllegalArgumentException
+   *           if the stream contains no sample
+   */
+  public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,
+      TrainingParameters mlParams, OSCCParameters osccParams,
+      OSCCFactory imsfactory) throws IOException {
+
+    HashMap<String, String> manifestInfoEntries =
+        new HashMap<String, String>();
+    ArrayList<Event> events = new ArrayList<Event>();
+    String wordTag = "";
+
+    WSDSample sample;
+    while ((sample = samples.read()) != null) {
+      if (events.isEmpty()) {
+        // the first sample decides which word the model is for
+        wordTag = sample.getTargetWordTag();
+      }
+
+      String sense = sample.getSenseIDs().get(0);
+      String[] context = cg.getContext(sample, osccParams.windowSize);
+      events.add(new Event(sense + "", context));
+    }
+
+    if (events.isEmpty()) {
+      // previously a null event stream was handed to the trainer
+      throw new IllegalArgumentException(
+          "Cannot train an OSCC model without training samples");
+    }
+
+    // build the stream once, after all events have been collected
+    ObjectStream<Event> es = ObjectStreamUtils.createObjectStream(events);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams.getSettings(), manifestInfoEntries);
+    MaxentModel osccModel = trainer.train(es);
+
+    return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel,
+        manifestInfoEntries, imsfactory);
+  }
+
+
+  /**
+   * Disambiguates the target word of the given sample.
+   * <p>
+   * When no model is loaded, or the loaded model belongs to another word tag,
+   * the model is read from the path formed by concatenating the training data
+   * directory, the target word tag and ".oscc.model". {@link MFS} is used as
+   * fallback when that file is missing or cannot be loaded, or when the
+   * classifier yields no outcome.
+   *
+   * @param sample
+   *          the sample holding the target word and its context
+   * @return the senses found, or {@code null} when the target tag is not
+   *         relevant and {@link WSDHelper} has no definition for it
+   */
+  @Override
+  public String[] disambiguate(WSDSample sample) {
+    if (!WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
+      // non relevant tags are answered from WSDHelper, without a model
+      if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
+        String s = OSCCParameters.SenseSource.WSDHELPER.name() + " "
+            + sample.getTargetTag();
+        return new String[] { s };
+      }
+      return null;
+    }
+
+    String wordTag = sample.getTargetWordTag();
+
+    if (osccModel == null || !osccModel.getWordTag().equals(wordTag)) {
+      String trainingFile = ((OSCCParameters) this.getParams())
+          .getTrainingDataDirectory() + wordTag;
+
+      // same suffix as the one appended by OSCCModel#writeModel; looking for
+      // ".ims.model" never found a model written by this package
+      File file = new File(trainingFile + ".oscc.model");
+      if (!file.exists() || file.isDirectory()) {
+        return new MFS().disambiguate(wordTag);
+      }
+
+      try {
+        setModel(new OSCCModel(file));
+      } catch (InvalidFormatException e) {
+        // never classify with a missing or stale model
+        e.printStackTrace();
+        return new MFS().disambiguate(wordTag);
+      } catch (IOException e) {
+        // never classify with a missing or stale model
+        e.printStackTrace();
+        return new MFS().disambiguate(wordTag);
+      }
+    }
+
+    String[] context = cg.getContext(sample,
+        ((OSCCParameters) this.params).windowSize);
+
+    double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
+    String outcome = osccModel.getOSCCMaxentModel().getBestOutcome(
+        outcomeProbs);
+
+    if (outcome == null || outcome.equals("")) {
+      return new MFS().disambiguate(wordTag);
+    }
+
+    outcome = this.getParams().getSenseSource().name() + " "
+        + wordTag.split("\\.")[0] + "%" + outcome;
+
+    return new String[] { outcome };
+  }
+
+  /**
+   * The OSCC disambiguation method for a single word
+   * 
+   * @param tokenizedContext
+   *          : the text containing the word to disambiguate
+   * @param tokenTags
+   *          : the tags corresponding to the context
+   * @param lemmas
+   *          : the lemmas of ALL the words in the context
+   * @param index
+   *          : the index of the word to disambiguate
+   * @return an array of the senses of the word to disambiguate
+   */
+  public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
+      String[] lemmas, int index) {
+    return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+        index));
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Properties;
+import java.net.URL;
+
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+public class OSCCModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "OSCCME";
+  private static final String OSCC_MODEL_ENTRY_NAME = "OSCC.model";
+
+  private static final String WORDTAG = "wordtag";
+  private static final String WINSIZE = "winsize";
+  private static final String CONTEXTCLUSTERS = "contextclusters";
+
+  //private ArrayList<String> contextClusters = new ArrayList<String>();
+  private String wordTag;
+  private int windowSize;
+
+  /*public ArrayList<String> getContextClusters() {
+    return contextClusters;
+  }*/
+
+  public int getWindowSize() {
+    return windowSize;
+  }
+
+  public void setWindowSize(int windowSize) {
+    this.windowSize = windowSize;
+  }
+
+ /* public void setContextClusters(ArrayList<String> contextClusters) {
+    this.contextClusters = contextClusters;
+  }*/
+
+  public String getWordTag() {
+    return wordTag;
+  }
+
+  public void setWordTag(String wordTag) {
+    this.wordTag = wordTag;
+  }
+
+   public OSCCModel(String languageCode, String wordTag, int windowSize,
+   MaxentModel osccModel,
+      Map<String, String> manifestInfoEntries, OSCCFactory factory) {
+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
+    artifactMap.put(OSCC_MODEL_ENTRY_NAME, osccModel);
+    this.setManifestProperty(WORDTAG, wordTag);
+    this.setManifestProperty(WINSIZE, windowSize + "");
+    
+//    this.setManifestProperty(CONTEXTCLUSTERS,
+//        StringUtils.join(contextClusters, ","));
+
+    //this.contextClusters = contextClusters;
+    checkArtifactMap();
+  }
+
+  public OSCCModel(String languageCode, String wordTag, int windowSize,
+      int ngram, MaxentModel osccModel, 
+      OSCCFactory factory) {
+    this(languageCode, wordTag, windowSize, osccModel,
+        null, factory);
+  }
+
+  public OSCCModel(InputStream in) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, in);
+    updateAttributes();
+  }
+
+  public OSCCModel(File modelFile) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, modelFile);
+    updateAttributes();
+  }
+
+  public OSCCModel(URL modelURL) throws IOException, InvalidFormatException {
+    super(COMPONENT_NAME, modelURL);
+    updateAttributes();
+  }
+
+  // path must include the word.tag i.e. : write.v
+  public boolean writeModel(String path) {
+    File outFile = new File(path + ".oscc.model");
+    CmdLineUtil.writeModel("oscc model", outFile, this);
+    return true;
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("OSCC model is incomplete!");
+    }
+  }
+
+  public MaxentModel getOSCCMaxentModel() {
+    if (artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof MaxentModel) {
+      return (MaxentModel) artifactMap.get(OSCC_MODEL_ENTRY_NAME);
+    } else {
+      return null;
+    }
+  }
+
+  public void updateAttributes() {
+    Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+    //String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
+
+   /* this.contextClusters = new ArrayList(
+        Arrays.asList(contextClusters.split(",")));*/
+    this.wordTag = (String) manifest.get(WORDTAG);
+    this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return OSCCFactory.class;
+  }
+
+  public OSCCFactory getFactory() {
+    return (OSCCFactory) this.toolFactory;
+  }
+
+}
\ No newline at end of file

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+/**
+ * This class contains the parameters for the OSCC approach as well as the
+ * directories containing the files used
+ */
+public class OSCCParameters extends WSDParameters {
+
+  protected String languageCode;
+  protected int windowSize;
+  protected String trainingDataDirectory;
+
+  protected static final int DFLT_WIN_SIZE = 3;
+  protected static final String DFLT_LANG_CODE = "En";
+  protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
+
+  /**
+   * This constructor takes a window size, a sense source and a training data
+   * directory. The default language used is <i>English</i>
+   * 
+   * @param windowSize
+   *          the size of the window used to extract the context cluster features
+   * @param senseSource
+   *          the source of the senses (e.g. WordNet)
+   * @param trainingDataDirectory
+   *          the directory of the training data; it is created when missing
+   */
+  public OSCCParameters(int windowSize, SenseSource senseSource,
+      String trainingDataDirectory) {
+    this.languageCode = DFLT_LANG_CODE;
+    this.windowSize = windowSize;
+    this.senseSource = senseSource;
+    this.trainingDataDirectory = trainingDataDirectory;
+    this.isCoarseSense = false;
+
+    File folder = new File(trainingDataDirectory);
+    if (!folder.exists())
+      folder.mkdirs();
+  }
+
+  public OSCCParameters(String trainingDataDirectory) {
+    this(DFLT_WIN_SIZE, DFLT_SOURCE, trainingDataDirectory);
+
+    File folder = new File(trainingDataDirectory);
+    if (!folder.exists())
+      folder.mkdirs();
+  }
+
+  public OSCCParameters() {
+    // TODO change the "" into null ??
+    this(DFLT_WIN_SIZE, DFLT_SOURCE, "");
+  }
+
+  public OSCCParameters(int windowSize) {
+    // TODO change the "" into null ??
+    this(windowSize, DFLT_SOURCE, "");
+  }
+
+  public String getLanguageCode() {
+    return languageCode;
+  }
+
+  public void setLanguageCode(String languageCode) {
+    this.languageCode = languageCode;
+  }
+
+  public int getWindowSize() {
+    return windowSize;
+  }
+
+  public void setWindowSize(int windowSize) {
+    this.windowSize = windowSize;
+  }
+
+  public OSCCContextGenerator createContextGenerator() {
+
+    return new DefaultOSCCContextGenerator();
+  }
+
+  public String getTrainingDataDirectory() {
+    return trainingDataDirectory;
+  }
+
+  public void setTrainingDataDirectory(String trainingDataDirectory) {
+    this.trainingDataDirectory = trainingDataDirectory;
+  }
+
+  @Override
+  public boolean isValid() {
+    // TODO make validity check
+    return true;
+  }
+
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+
+import org.junit.Test;
+
+/**
+ * Runs the OSCC disambiguator over the Senseval words supplied by
+ * {@link SensevalReader} and prints the evaluation result for each word
+ * that is not a verb.
+ */
+public class OSCCEvaluatorTest {
+
+  /**
+   * Directory holding the tokenizer, lemmatizer and tagger resources.
+   * Forward slashes are used so the path also resolves on non-Windows
+   * systems.
+   */
+  private static final String MODELS_DIR = "src/test/resources/models/";
+
+  static SensevalReader seReader = new SensevalReader();
+
+  /**
+   * JUnit entry point. JUnit 4 test methods must be public, non-static and
+   * take no parameters, so the annotation lives here rather than on
+   * {@link #main(String[])}.
+   */
+  @Test
+  public void testEvaluation() {
+    runEvaluation();
+  }
+
+  /**
+   * Command line entry point; runs the same evaluation as the JUnit test.
+   *
+   * @param args
+   *          ignored
+   */
+  public static void main(String[] args) {
+    runEvaluation();
+  }
+
+  /**
+   * Loads the helper resources, creates the disambiguator and evaluates it
+   * word by word, printing the outcome through {@link WSDHelper}.
+   */
+  private static void runEvaluation() {
+
+    WSDHelper.print("Evaluation Started");
+
+    // TODO write unit test
+    WSDHelper.loadTokenizer(MODELS_DIR + "en-token.bin");
+    WSDHelper.loadLemmatizer(MODELS_DIR + "en-lemmatizer.dict");
+    WSDHelper.loadTagger(MODELS_DIR + "en-pos-maxent.bin");
+
+    OSCCParameters osccParams = new OSCCParameters("");
+    OSCCME oscc = new OSCCME(osccParams);
+
+    ArrayList<String> words = seReader.getSensevalWords();
+
+    for (String word : words) {
+      WSDEvaluator evaluator = new WSDEvaluator(oscc);
+
+      // don't take verbs because they are not from WordNet
+      if (isVerb(word)) {
+        continue;
+      }
+
+      ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+      if (instances == null) {
+        WSDHelper.print("null instances");
+        continue;
+      }
+
+      WSDHelper.print("------------------" + word + "------------------");
+      for (WSDSample instance : instances) {
+        // Only samples whose first sense id is not the literal string
+        // "null" are evaluated.
+        if (instance.getSenseIDs() != null
+            && !instance.getSenseIDs().get(0).equals("null")) {
+          evaluator.evaluateSample(instance);
+        }
+      }
+      WSDHelper.print(evaluator.toString());
+    }
+  }
+
+  /**
+   * Tells whether the second dot-separated segment of {@code word} is
+   * {@code "v"}. A word without a dot is reported as not being a verb
+   * instead of raising an {@link ArrayIndexOutOfBoundsException}.
+   *
+   * @param word
+   *          the word to inspect, e.g. {@code "write.v"}
+   * @return {@code true} if the segment after the first dot equals "v"
+   */
+  private static boolean isVerb(String word) {
+    String[] parts = word.split("\\.");
+    return parts.length > 1 && "v".equals(parts[1]);
+  }
+}

Propchange: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java?rev=1697959&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
 (added)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
 Wed Aug 26 15:56:53 2015
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.oscc.OSCCFactory;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCModel;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Manual demo: trains an OSCC model from Semcor data for one word, writes
+ * it out, reloads it and disambiguates two example sentences, printing the
+ * results through {@link WSDHelper}.
+ */
+public class OSCCTester {
+
+  /**
+   * Runs the demo.
+   *
+   * @param args
+   *          ignored
+   * @throws IllegalStateException
+   *           if training, writing or reloading the model fails with an
+   *           {@link IOException}
+   */
+  public static void main(String[] args) {
+
+    SemcorReaderExtended sr = new SemcorReaderExtended();
+
+    // Forward slashes so the path also resolves on non-Windows systems.
+    String modelsDir = "src/test/resources/models/";
+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+    String test = "write.v";
+    TrainingParameters trainingParams = new TrainingParameters();
+    OSCCParameters osccParams = new OSCCParameters("");
+    OSCCFactory osccFactory = new OSCCFactory();
+
+    ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+    OSCCModel readModel;
+    try {
+      OSCCModel model = OSCCME.train("en", sampleStream, trainingParams,
+          osccParams, osccFactory);
+      model.writeModel(test);
+      // NOTE(review): presumably writeModel(test) writes to
+      // test + ".OSCC.model" - confirm against OSCCModel.
+      File outFile = new File(test + ".OSCC.model");
+      readModel = new OSCCModel(outFile);
+    } catch (IOException e) {
+      // Fail fast: continuing would hand a null model to OSCCME below.
+      throw new IllegalStateException(
+          "Could not train, write or reload the OSCC model for " + test, e);
+    }
+    OSCCME oscc = new OSCCME(readModel, osccParams);
+
+    // This is how to make the context for one-word-disambiguation using
+    // OSCC
+    String test1 = "We need to discuss important topic, please write to me "
+        + "soon.";
+    String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+    String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+    String[] lemmas1 = lemmatize(sentence1, tags1);
+
+    // output
+    // NOTE(review): index 8 is presumably the token "write" - depends on
+    // the tokenizer output, confirm.
+    int targetIndex = 8;
+    String[] senses1 = oscc.disambiguate(sentence1, tags1, lemmas1,
+        targetIndex);
+    System.out.print(lemmas1[targetIndex] + " :\t");
+    WSDHelper.print(senses1);
+    WSDHelper.print("*****************************");
+
+    // This is how to make the context for disambiguation of span of words
+    String test2 = "The component was highly radioactive to the point that"
+        + " it has been activated the second it touched water";
+    String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+    String[] tags2 = WSDHelper.getTagger().tag(sentence2);
+    String[] lemmas2 = lemmatize(sentence2, tags2);
+    Span span = new Span(3, 7);
+
+    // output
+    List<String[]> senses2 = oscc.disambiguate(sentence2, tags2, lemmas2,
+        span);
+    // NOTE(review): the loop bound treats span.getEnd() as inclusive -
+    // confirm that disambiguate returns one entry per index in that range.
+    for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
+      String[] senses = senses2.get(i - span.getStart());
+      System.out.print(lemmas2[i] + " :\t");
+      WSDHelper.print(senses);
+      WSDHelper.print("----------");
+    }
+
+    WSDHelper.print("*****************************");
+  }
+
+  /**
+   * Lemmatizes every token with the lemmatizer held by {@link WSDHelper}.
+   *
+   * @param tokens
+   *          the tokens of one sentence
+   * @param tags
+   *          the tag for each token, read at the same index as the token
+   * @return one lemma per token, in token order
+   */
+  private static String[] lemmatize(String[] tokens, String[] tags) {
+    String[] lemmas = new String[tokens.length];
+    for (int i = 0; i < tokens.length; i++) {
+      lemmas[i] = WSDHelper.getLemmatizer().lemmatize(tokens[i], tags[i]);
+    }
+    return lemmas;
+  }
+}
\ No newline at end of file

Propchange: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java?rev=1697959&r1=1697958&r2=1697959&view=diff
==============================================================================
--- 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
 (original)
+++ 
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
 Wed Aug 26 15:56:53 2015
@@ -1,39 +1,36 @@
 package opennlp.tools.disambiguator;
 
-import java.util.ArrayList;
-import java.util.List;
 
-import opennlp.tools.disambiguator.ims.IMS;
 
 public class Tester {
 
   public static void main(String[] args) {
-
-    String modelsDir = "src\\test\\resources\\models\\";
-    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
-    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
-    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
-    IMS ims = new IMS();
-
-    String test3 = "The summer is almost over and I haven't been to the beach even once";
-    String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-    String[] tags3 = WSDHelper.getTagger().tag(sentence3);
-    List<String> tempLemmas3 = new ArrayList<String>();
-    for (int i = 0; i < sentence3.length; i++) {
-      String lemma = WSDHelper.getLemmatizer()
-          .lemmatize(sentence3[i], tags3[i]);
-      tempLemmas3.add(lemma);
-    }
-    String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
-    // output
-    List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
-    for (int i = 0; i < sentence3.length; i++) {
-      System.out.print(sentence3[i] + " : ");
-      WSDHelper.printResults(ims, senses3.get(i));
-      WSDHelper.print("----------");
-    }
+//
+//    String modelsDir = "src\\test\\resources\\models\\";
+//    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+//    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+//    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+//
+//    IMSME ims = new IMSME();
+//
+//    String test3 = "The summer is almost over and I haven't been to the beach even once";
+//    String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+//    String[] tags3 = WSDHelper.getTagger().tag(sentence3);
+//    List<String> tempLemmas3 = new ArrayList<String>();
+//    for (int i = 0; i < sentence3.length; i++) {
+//      String lemma = WSDHelper.getLemmatizer()
+//          .lemmatize(sentence3[i], tags3[i]);
+//      tempLemmas3.add(lemma);
+//    }
+//    String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+//
+//    // output
+//    List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
+//    for (int i = 0; i < sentence3.length; i++) {
+//      System.out.print(sentence3[i] + " : ");
+//      WSDHelper.printResults(ims, senses3.get(i));
+//      WSDHelper.print("----------");
+//    }
 
   }
 }
\ No newline at end of file


Reply via email to