Author: joern
Date: Tue Aug 4 07:48:48 2015
New Revision: 1694008
URL: http://svn.apache.org/r1694008
Log:
OPENNLP-758 Updated Lesk with new data readers and added MFS in case no
overlaps are found (similar to the simplified version). Thanks to Anthony
Beylerian for providing a patch.
Added:
opennlp/sandbox/opennlp-wsd/.project (with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
(with props)
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
(with props)
Removed:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IParagraph.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/ISentence.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/IWord.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SemcorReaderExtended.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DatasetsReader/SensevalReader.java
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
Added: opennlp/sandbox/opennlp-wsd/.project
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/.project?rev=1694008&view=auto
==============================================================================
--- opennlp/sandbox/opennlp-wsd/.project (added)
+++ opennlp/sandbox/opennlp-wsd/.project Tue Aug 4 07:48:48 2015
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>opennlp-wsd</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ </buildSpec>
+ <natures>
+ </natures>
+</projectDescription>
Propchange: opennlp/sandbox/opennlp-wsd/.project
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Constants.java
Tue Aug 4 07:48:48 2015
@@ -138,33 +138,48 @@ public class Constants {
"you're", "yours", "yourself", "yourselves", "you've", "zero"));
// Print a text in the console
- public static void printResults(WSDisambiguator disambiguator,
- String[] results) {
+//Print a text in the console
+ public static void printResults(WSDisambiguator disambiguator,
+ String[] results) {
+
+ if (results != null) {
+
+ String[] parts;
+ String sensekey;
+ if (disambiguator instanceof Lesk) {
+
+ Double score;
+
+ for (String result : results) {
+ parts = result.split(" ");
+ sensekey = parts[1];
+ score = Double.parseDouble(parts[2]);
+ try {
+ Constants.print("score : "
+ + score
+ + " for : "
+ + Loader.getDictionary().getWordBySenseKey(sensekey)
+ .getSynset().getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ } else {
+ for (String result : results) {
+ parts = result.split(" ");
+ sensekey = parts[1];
+ try {
+ Constants.print("sense : "
+ + Loader.getDictionary().getWordBySenseKey(sensekey)
+ .getSynset().getGloss());
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
- if (results != null) {
-
- if (disambiguator instanceof Lesk) {
- POS pos;
- long offset;
- double score;
- String[] parts;
-
- for (String result : results) {
- parts = result.split("@");
- pos = POS.getPOSForKey(parts[0]);
- offset = Long.parseLong(parts[1]);
- score = Double.parseDouble(parts[3]);
- try {
- Constants.print("score : " + score + " for : "
- + Loader.getDictionary().getSynsetAt(pos, offset).getGloss());
- } catch (JWNLException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- }
+ }
public static void print(Object in) {
if (in == null) {
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/Loader.java
Tue Aug 4 07:48:48 2015
@@ -30,7 +30,7 @@ import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;
import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.lemmatizer.SimpleLemmatizer;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java?rev=1694008&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
Tue Aug 4 07:48:48 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.util.ArrayList;
+
+public class Paragraph {
+
+  /** Paragraph number ({@code 0} until one is set). */
+  protected int pnum;
+  /** Sentences of this paragraph, in insertion order. */
+  protected ArrayList<Sentence> isentences;
+
+  /** Creates an empty paragraph. */
+  public Paragraph() {
+    super();
+    this.isentences = new ArrayList<Sentence>();
+  }
+
+  /**
+   * Creates an empty paragraph with the given number.
+   *
+   * @param pnum
+   *          the paragraph number
+   */
+  public Paragraph(int pnum) {
+    super();
+    this.pnum = pnum;
+    this.isentences = new ArrayList<Sentence>();
+  }
+
+  /**
+   * Creates a paragraph with the given number and sentences.
+   *
+   * @param pnum
+   *          the paragraph number
+   * @param sentences
+   *          the sentences of the paragraph; the list is stored as is, it is
+   *          not copied
+   */
+  public Paragraph(int pnum, ArrayList<Sentence> sentences) {
+    super();
+    this.pnum = pnum;
+    this.isentences = sentences;
+  }
+
+  /** @return the paragraph number */
+  public int getPnum() {
+    return pnum;
+  }
+
+  /**
+   * @param pnum
+   *          the new paragraph number
+   */
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  /**
+   * Returns the live list of sentences. The spelling of the method name is
+   * kept as it is so that existing callers keep compiling.
+   *
+   * @return the sentences of this paragraph
+   */
+  public ArrayList<Sentence> getSsentences() {
+    return isentences;
+  }
+
+  /**
+   * @param isentences
+   *          the list that replaces the current sentences (stored as is)
+   */
+  public void setIsentences(ArrayList<Sentence> isentences) {
+    this.isentences = isentences;
+  }
+
+  /**
+   * Appends one sentence to the paragraph.
+   *
+   * @param isentence
+   *          the sentence to append
+   */
+  public void addIsentence(Sentence isentence) {
+    this.isentences.add(isentence);
+  }
+
+  /**
+   * Joins the string form of all sentences with single spaces.
+   *
+   * @return the paragraph text, or an empty string when the paragraph has no
+   *         sentences (the previous implementation threw a
+   *         {@link StringIndexOutOfBoundsException} in that case)
+   */
+  @Override
+  public String toString() {
+    StringBuilder paragraph = new StringBuilder();
+    for (int i = 0; i < this.isentences.size(); i++) {
+      if (i > 0) {
+        paragraph.append(' ');
+      }
+      paragraph.append(this.isentences.get(i).toString());
+    }
+    return paragraph.toString();
+  }
+
+  /**
+   * Checks whether at least one word of this paragraph is an instance of the
+   * given word tag, as decided by {@code Word.isInstanceOf(String)}.
+   *
+   * NOTE(review): the earlier documentation required the word to be
+   * sense-tagged as well; that check is assumed to be part of
+   * {@code Word.isInstanceOf} - confirm.
+   *
+   * @param wordTag
+   *          the word tag to look for
+   * @return {@code true} if a matching word exists in the paragraph,
+   *         {@code false} otherwise
+   */
+  public boolean contains(String wordTag) {
+
+    for (Sentence isentence : this.getSsentences()) {
+      for (Word iword : isentence.getIwords()) {
+        // The previous code compared iword with itself, which is always true.
+        if (iword.isInstanceOf(wordTag)) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Paragraph.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java?rev=1694008&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
Tue Aug 4 07:48:48 2015
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Reads SemCor 3.0 XML files and extracts, for a given word, its instances
+ * in the {@link WordToDisambiguate} format.
+ */
+public class SemcorReaderExtended {
+
+ private static final String ELEMENT_CONTEXTFILE = "contextfile";
+ private static final String ATTRIBUTE_CONCORDANCE = "concordance";
+
+ private static final String ELEMENT_CONTEXT = "context";
+ private static final String ATTRIBUTE_FILENAME = "filename";
+ private static final String ATTRIBUTE_PARAS = "paras";
+
+ private static final String ELEMENT_PARAGRAPH = "p";
+ private static final String ATTRIBUTE_PARAGRAPHNUM = "pnum";
+
+ private static final String ELEMENT_SENTENCE = "s";
+ private static final String ATTRIBUTE_SENTENCENUM = "snum";
+
+ private static final String ELEMENT_WORDFORM = "wf";
+ private static final String ATTRIBUTE_CMD = "cmd";
+ private static final String ATTRIBUTE_RDF = "rdf";
+ private static final String ATTRIBUTE_POS = "pos";
+ private static final String ATTRIBUTE_LEMMA = "lemma";
+ private static final String ATTRIBUTE_WNSN = "wnsn";
+ private static final String ATTRIBUTE_LEXSN = "lexsn";
+
+ private static final String ELEMENT_PUNCTUATION = "punc";
+
+ // '/' is used instead of "\\" so the locations resolve on every platform;
+ // java.io.File also accepts '/' as a separator on Windows.
+ private static final String path = "src/test/resources/semcor3.0/";
+ private static final String[] folders = { "brown1", "brown2", "brownv" };
+ private static final String tagfiles = "/tagfiles/";
+
+ /**
+ * Creates a reader. The reader has no instance state; the Semcor locations
+ * come from the static fields of this class.
+ */
+ public SemcorReaderExtended() {
+ super();
+ }
+
+ /**
+  * Reads one Semcor XML file and returns its sentences in document order.
+  * Every {@code <s>} element found directly under a {@code <p>} element
+  * becomes one {@link Sentence}.
+  *
+  * Parsing is best effort: if the file cannot be read or parsed, the stack
+  * trace is printed and the sentences collected so far are returned.
+  *
+  * @param file
+  *          the path of the Semcor XML file
+  * @return the sentences read from the file, possibly empty, never
+  *         {@code null}
+  */
+ public ArrayList<Sentence> readFile(String file) {
+
+   ArrayList<Sentence> result = new ArrayList<Sentence>();
+
+   try {
+
+     File xmlFile = new File(file);
+     DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+     DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+     Document doc = dBuilder.parse(xmlFile);
+
+     doc.getDocumentElement().normalize();
+
+     NodeList paragraphs = doc.getElementsByTagName(ELEMENT_PARAGRAPH);
+
+     for (int i = 0; i < paragraphs.getLength(); i++) {
+
+       Node nParagraph = paragraphs.item(i);
+       if (nParagraph.getNodeType() != Node.ELEMENT_NODE) {
+         continue;
+       }
+
+       int paragraphID = Integer.parseInt(((Element) nParagraph)
+           .getAttribute(ATTRIBUTE_PARAGRAPHNUM));
+
+       NodeList nSentences = nParagraph.getChildNodes();
+
+       // Start at 0: the first child is only a whitespace text node when the
+       // file is pretty-printed; starting at 1 dropped the first sentence of
+       // compactly written paragraphs. Non-sentence children are filtered by
+       // node type and name instead of by position.
+       for (int j = 0; j < nSentences.getLength(); j++) {
+         Node nSentence = nSentences.item(j);
+         if (nSentence.getNodeType() == Node.ELEMENT_NODE
+             && ELEMENT_SENTENCE.equals(nSentence.getNodeName())) {
+           result.add(readSentence((Element) nSentence, paragraphID));
+         }
+       }
+     }
+   } catch (Exception e) {
+     e.printStackTrace();
+   }
+
+   return result;
+ }
+
+ /**
+  * Converts one {@code <s>} element into a {@link Sentence}; word numbers
+  * are assigned consecutively, starting at 0, to the word forms and
+  * punctuation marks that are kept.
+  */
+ private Sentence readSentence(Element eSentence, int paragraphID) {
+
+   int sentenceID = Integer.parseInt(eSentence
+       .getAttribute(ATTRIBUTE_SENTENCENUM));
+   Sentence isentence = new Sentence(paragraphID, sentenceID);
+
+   NodeList nWords = eSentence.getChildNodes();
+   int wnum = 0;
+
+   for (int k = 0; k < nWords.getLength(); k++) {
+     Node nWord = nWords.item(k);
+     if (nWord.getNodeType() != Node.ELEMENT_NODE) {
+       continue;
+     }
+
+     Word iword = readWord((Element) nWord, paragraphID, sentenceID, wnum);
+     if (iword != null) {
+       isentence.addIword(iword);
+       wnum++;
+     }
+   }
+
+   return isentence;
+ }
+
+ /**
+  * Builds the {@link Word} for a word-form or punctuation element; returns
+  * {@code null} for any other element so that the caller skips it.
+  */
+ private Word readWord(Element eWord, int paragraphID, int sentenceID,
+     int wnum) {
+
+   String word = eWord.getTextContent();
+
+   if (ELEMENT_PUNCTUATION.equals(eWord.getNodeName())) {
+     return new Word(paragraphID, sentenceID, wnum,
+         Word.Type.PUNCTUATIONMARK, word);
+   }
+
+   if (!ELEMENT_WORDFORM.equals(eWord.getNodeName())) {
+     return null;
+   }
+
+   String cmd = eWord.getAttribute(ATTRIBUTE_CMD);
+   String pos = eWord.getAttribute(ATTRIBUTE_POS);
+
+   if ("done".equals(cmd)) {
+     // the word is already disambiguated: keep its lemma and sense data
+     String lemma = eWord.getAttribute(ATTRIBUTE_LEMMA);
+     String wnsn = eWord.getAttribute(ATTRIBUTE_WNSN);
+     String lexsn = eWord.getAttribute(ATTRIBUTE_LEXSN);
+     return new Word(paragraphID, sentenceID, wnum, Word.Type.WORD, word,
+         cmd, pos, lemma, wnsn, lexsn);
+   }
+
+   // the word is not disambiguated
+   return new Word(paragraphID, sentenceID, wnum, Word.Type.WORD, word, cmd,
+       pos);
+ }
+
+ /**
+  * Reads one Semcor file and returns the instances of the given word found
+  * in it. The context of an instance is a window of up to three sentences:
+  * the current sentence plus the next two for the first sentence of the
+  * file, the previous two plus the current one for the last sentence, and
+  * the previous, current and next sentence otherwise. The window is clipped
+  * to the sentences that actually exist, so files with fewer than three
+  * sentences no longer fail with an {@link IndexOutOfBoundsException}.
+  *
+  * Words whose {@code getLexsn()} is {@code null} are skipped. Any exception
+  * is printed and the instances collected so far are returned.
+  *
+  * @param file
+  *          the path of the Semcor XML file
+  * @param wordTag
+  *          the word whose instances are looked for
+  * @return the instances found, possibly empty, never {@code null}
+  */
+ public ArrayList<WordToDisambiguate> getSemcorOneFileData(String file,
+     String wordTag) {
+
+   ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+   try {
+
+     ArrayList<Sentence> isentences = readFile(file);
+     int last = isentences.size() - 1;
+
+     for (int j = 0; j <= last; j++) {
+       ArrayList<Word> iwords = isentences.get(j).getIwords();
+
+       for (int k = 0; k < iwords.size(); k++) {
+         Word iword = iwords.get(k);
+         if (!iword.isInstanceOf(wordTag)) {
+           continue;
+         }
+
+         String sense = iword.getLexsn();
+         if (sense == null) {
+           continue;
+         }
+
+         // Context window [start, end], clipped to the available sentences.
+         int start;
+         int end;
+         if (j == 0) {
+           start = 0;
+           end = Math.min(2, last);
+         } else if (j == last) {
+           start = Math.max(0, j - 2);
+           end = j;
+         } else {
+           start = j - 1;
+           end = j + 1;
+         }
+
+         // The index of the target word is its position in its own sentence
+         // plus the word counts of the window sentences that precede it.
+         StringBuilder context = new StringBuilder();
+         int index = k;
+         for (int s = start; s <= end; s++) {
+           if (s > start) {
+             context.append(' ');
+           }
+           context.append(isentences.get(s).toString());
+           if (s < j) {
+             index += isentences.get(s).getIwords().size();
+           }
+         }
+
+         ArrayList<String> senses = new ArrayList<String>();
+         senses.add(sense);
+
+         setInstances.add(new WordToDisambiguate(
+             context.toString().split("\\s"), index, senses));
+       }
+     }
+
+   } catch (Exception e) {
+     e.printStackTrace();
+   }
+
+   return setInstances;
+
+ }
+
+ /**
+  * One Semcor folder reader: reads all the files in one Semcor folder and
+  * returns all the instances of a specific word in the
+  * {@link WordToDisambiguate} format.
+  *
+  * @param folder
+  *          the name of the folder. Three folders exist in Semcor3.0, which
+  *          are ["brown1", "brown2", "brownv"]
+  * @param wordTag
+  *          the word whose instances are looked for
+  * @return the list of the {@link WordToDisambiguate} instances; empty when
+  *         the folder does not exist or cannot be listed
+  */
+ public ArrayList<WordToDisambiguate> getSemcorFolderData(String folder,
+     String wordTag) {
+
+   ArrayList<WordToDisambiguate> result = new ArrayList<WordToDisambiguate>();
+
+   File tempFolder = new File(path + folder + tagfiles);
+
+   // listFiles() returns null when the path is not a directory or when an
+   // I/O error occurs, so it covers the former isDirectory() check as well.
+   File[] listOfFiles = tempFolder.listFiles();
+   if (listOfFiles == null) {
+     return result;
+   }
+
+   for (File file : listOfFiles) {
+     // Sub-directories cannot be parsed as Semcor files; skip them.
+     if (file.isFile()) {
+       result.addAll(getSemcorOneFileData(file.getPath(), wordTag));
+     }
+   }
+
+   return result;
+
+ }
+
+ /**
+  * Semcor reader: collects, over every configured Semcor folder, the
+  * instances of a specific word in the {@link WordToDisambiguate} format.
+  *
+  * @param wordTag
+  *          the word whose instances are looked for
+  * @return the {@link WordToDisambiguate} instances of the word, in folder
+  *         order
+  */
+ public ArrayList<WordToDisambiguate> getSemcorData(String wordTag) {
+
+   ArrayList<WordToDisambiguate> instances = new ArrayList<WordToDisambiguate>();
+
+   for (int f = 0; f < folders.length; f++) {
+     instances.addAll(getSemcorFolderData(folders[f], wordTag));
+   }
+
+   return instances;
+
+ }
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SemcorReaderExtended.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java?rev=1694008&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
Tue Aug 4 07:48:48 2015
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Collections;
+import java.util.Arrays;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import opennlp.tools.disambiguator.WordToDisambiguate;
+import opennlp.tools.disambiguator.ims.WTDIMS;
+
+/**
+ * This class handles the extraction of Senseval-3 data from the different
files
+ * (training data, dictionary instances, etc.)
+ */
+public class SensevalReader {
+
+ // '/' is used instead of "\\" so the locations resolve on every platform;
+ // java.io.File also accepts '/' as a separator on Windows.
+ private String resourcesFolder = "src/test/resources/";
+ protected String sensevalDirectory = resourcesFolder + "senseval3/";
+
+ protected String data = sensevalDirectory + "EnglishLS.train";
+ protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
+ protected String wordList = sensevalDirectory + "EnglishLS.train.key";
+
+ // protected String dict = sensevalDirectory + "EnglishLS.dictionary.xml";
+ // protected String map = sensevalDirectory + "EnglishLS.sensemap";
+
+ /**
+ * The XML file of Senseval presents some issues that need to be fixed first.
+ * This method is the placeholder for that repair step; it is not implemented
+ * yet, performs no work and always returns {@code null}.
+ *
+ * @return always {@code null} in the current implementation
+ */
+ private String fixXmlFile() {
+
+ // TODO fix this !
+
+ return null;
+ }
+
+ /**
+ * Creates a reader that uses the default Senseval file locations held in the
+ * fields of this class.
+ */
+ public SensevalReader() {
+ super();
+ }
+
+ /**
+  * Extracts the groups of equivalent senses from the sense map file. This
+  * serves in the case of the coarse-grained disambiguation.
+  *
+  * Each line of the file is one group: it is split on whitespace and every
+  * token longer than one character is kept. Groups are numbered by line,
+  * starting at 0. If the file cannot be read, the stack trace is printed and
+  * the groups read so far are returned.
+  *
+  * @return a {@link HashMap} from the group number ({@link Integer}) to the
+  *         {@link ArrayList} of the original sense IDs of that group
+  */
+ public HashMap<Integer, ArrayList<String>> getEquivalentSense() {
+
+   HashMap<Integer, ArrayList<String>> groups = new HashMap<Integer, ArrayList<String>>();
+
+   try (BufferedReader mapReader = new BufferedReader(new FileReader(
+       sensemapFile))) {
+
+     int lineNumber = 0;
+
+     for (String entry = mapReader.readLine(); entry != null; entry = mapReader
+         .readLine()) {
+
+       ArrayList<String> group = new ArrayList<String>();
+
+       for (String token : entry.split("\\s")) {
+         if (token.length() <= 1) {
+           continue;
+         }
+         group.add(token);
+       }
+
+       groups.put(lineNumber, group);
+       lineNumber++;
+     }
+
+   } catch (IOException e) {
+     e.printStackTrace();
+   }
+
+   return groups;
+
+ }
+
+ /**
+  * Lists the words available in the Senseval data: the first
+  * whitespace-separated token of every line of the word list file, without
+  * duplicates and in order of first appearance. If the file cannot be read,
+  * the stack trace is printed and the words read so far are returned.
+  *
+  * @return {@link ArrayList} of the words available in the current Senseval
+  *         set
+  */
+ public ArrayList<String> getSensevalWords() {
+
+   ArrayList<String> uniqueWords = new ArrayList<String>();
+
+   try (BufferedReader keyReader = new BufferedReader(new FileReader(wordList))) {
+
+     for (String entry = keyReader.readLine(); entry != null; entry = keyReader
+         .readLine()) {
+
+       String firstToken = entry.split("\\s")[0];
+
+       if (uniqueWords.contains(firstToken)) {
+         continue;
+       }
+       uniqueWords.add(firstToken);
+     }
+
+   } catch (IOException e) {
+     e.printStackTrace();
+   }
+
+   return uniqueWords;
+
+ }
+
+ /**
+  * Main Senseval reader: parses the training data file and extracts the
+  * instances of the given word.
+  *
+  * Every element child of the {@code lexelt} element whose {@code item}
+  * attribute equals {@code wordTag} is read as one instance. Instances
+  * without a usable context are skipped. Any exception is printed and the
+  * instances collected so far are returned.
+  *
+  * @param wordTag
+  *          the word whose instances are looked for, as written in the
+  *          {@code item} attribute
+  * @return the list of the {@link WordToDisambiguate} instances of the word,
+  *         possibly empty, never {@code null}
+  */
+ public ArrayList<WordToDisambiguate> getSensevalData(String wordTag) {
+
+   ArrayList<WordToDisambiguate> setInstances = new ArrayList<WordToDisambiguate>();
+
+   try {
+
+     File xmlFile = new File(data);
+     DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+     DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+     Document doc = dBuilder.parse(xmlFile);
+
+     doc.getDocumentElement().normalize();
+
+     NodeList lexelts = doc.getElementsByTagName("lexelt");
+
+     for (int i = 0; i < lexelts.getLength(); i++) {
+
+       Node nLexelt = lexelts.item(i);
+       if (nLexelt.getNodeType() != Node.ELEMENT_NODE
+           || !((Element) nLexelt).getAttribute("item").equals(wordTag)) {
+         continue;
+       }
+
+       NodeList nInstances = nLexelt.getChildNodes();
+
+       // Start at 0 and rely on the node type: the first child is only a
+       // whitespace text node when the file is pretty-printed.
+       for (int j = 0; j < nInstances.getLength(); j++) {
+         Node nInstance = nInstances.item(j);
+         if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
+           WTDIMS instance = readInstance((Element) nInstance);
+           if (instance != null) {
+             setInstances.add(instance);
+           }
+         }
+       }
+     }
+
+   } catch (Exception e) {
+     e.printStackTrace();
+   }
+
+   return setInstances;
+
+ }
+
+ /**
+  * Reads one instance element: collects the {@code senseid} of its
+  * {@code answer} children and tokenizes its {@code context}. Returns
+  * {@code null} when no context with a target word was found.
+  */
+ private WTDIMS readInstance(Element eInstance) {
+
+   ArrayList<String> answers = new ArrayList<String>();
+   String[] finalText = null;
+   int index = 0;
+
+   NodeList nChildren = eInstance.getChildNodes();
+
+   for (int k = 0; k < nChildren.getLength(); k++) {
+     Node nChild = nChildren.item(k);
+     if (nChild.getNodeType() != Node.ELEMENT_NODE) {
+       continue;
+     }
+
+     if ("answer".equals(nChild.getNodeName())) {
+       // Read the attribute by name: the order of a NamedNodeMap is not
+       // specified, so item(1) is not guaranteed to be the sense id.
+       String senseid = ((Element) nChild).getAttribute("senseid");
+       if (!senseid.isEmpty()) {
+         answers.add(senseid);
+       }
+     } else if ("context".equals(nChild.getNodeName())) {
+
+       // The first element child of the context is taken as the target
+       // word; everything before it is the left context and everything
+       // after it the right context. For the usual "text, element, text"
+       // layout this matches reading the children at positions 0, 1 and 2.
+       StringBuilder textBefore = new StringBuilder();
+       StringBuilder textAfter = new StringBuilder();
+       String rawWord = null;
+
+       NodeList parts = nChild.getChildNodes();
+       for (int p = 0; p < parts.getLength(); p++) {
+         Node part = parts.item(p);
+         if (rawWord == null && part.getNodeType() == Node.ELEMENT_NODE) {
+           rawWord = part.getTextContent();
+         } else if (rawWord == null) {
+           textBefore.append(part.getTextContent());
+         } else {
+           textAfter.append(part.getTextContent());
+         }
+       }
+
+       if (rawWord == null) {
+         // no target word marked in this context
+         continue;
+       }
+
+       ArrayList<String> tokens = tokenize(textBefore.toString());
+       index = tokens.size();
+       tokens.add(rawWord.toLowerCase());
+       tokens.addAll(tokenize(textAfter.toString()));
+
+       finalText = tokens.toArray(new String[tokens.size()]);
+     }
+   }
+
+   if (finalText == null) {
+     return null;
+   }
+
+   return new WTDIMS(finalText, index, answers);
+ }
+
+ /** Splits a text on whitespace and drops the empty tokens. */
+ private static ArrayList<String> tokenize(String text) {
+   ArrayList<String> tokens = new ArrayList<String>();
+   for (String token : text.split("\\s")) {
+     if (!token.isEmpty()) {
+       tokens.add(token);
+     }
+   }
+   return tokens;
+ }
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java?rev=1694008&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
Tue Aug 4 07:48:48 2015
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.util.ArrayList;
+
+public class Sentence {
+
+  /** Number of the paragraph this sentence belongs to ({@code 0} until set). */
+  protected int pnum;
+  /** Number of this sentence ({@code 0} until set). */
+  protected int snum;
+  /** Words of this sentence, in insertion order. */
+  protected ArrayList<Word> iwords;
+
+  /** Creates an empty sentence. */
+  public Sentence() {
+    super();
+    this.iwords = new ArrayList<Word>();
+  }
+
+  /**
+   * Creates an empty sentence with the given numbers.
+   *
+   * @param pnum
+   *          the paragraph number
+   * @param snum
+   *          the sentence number
+   */
+  public Sentence(int pnum, int snum) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = new ArrayList<Word>();
+  }
+
+  /**
+   * Creates a sentence with the given numbers and words.
+   *
+   * @param pnum
+   *          the paragraph number
+   * @param snum
+   *          the sentence number
+   * @param iwords
+   *          the words of the sentence; the list is stored as is, it is not
+   *          copied
+   */
+  public Sentence(int pnum, int snum, ArrayList<Word> iwords) {
+    super();
+    this.pnum = pnum;
+    this.snum = snum;
+    this.iwords = iwords;
+  }
+
+  /** @return the paragraph number */
+  public int getPnum() {
+    return pnum;
+  }
+
+  /**
+   * @param pnum
+   *          the new paragraph number
+   */
+  public void setPnum(int pnum) {
+    this.pnum = pnum;
+  }
+
+  /** @return the sentence number */
+  public int getSnum() {
+    return snum;
+  }
+
+  /**
+   * @param snum
+   *          the new sentence number
+   */
+  public void setSnum(int snum) {
+    this.snum = snum;
+  }
+
+  /** @return the live list of the words of this sentence */
+  public ArrayList<Word> getIwords() {
+    return iwords;
+  }
+
+  /**
+   * @param iwords
+   *          the list that replaces the current words (stored as is)
+   */
+  public void setIwords(ArrayList<Word> iwords) {
+    this.iwords = iwords;
+  }
+
+  /**
+   * Appends one word to the sentence.
+   *
+   * @param iword
+   *          the word to append
+   */
+  public void addIword(Word iword) {
+    this.iwords.add(iword);
+  }
+
+  /**
+   * Joins the string form of all words with single spaces.
+   *
+   * @return the sentence text, or an empty string when the sentence has no
+   *         words (the previous implementation threw a
+   *         {@link StringIndexOutOfBoundsException} in that case)
+   */
+  @Override
+  public String toString() {
+    StringBuilder sentence = new StringBuilder();
+    for (int i = 0; i < this.iwords.size(); i++) {
+      if (i > 0) {
+        sentence.append(' ');
+      }
+      sentence.append(this.iwords.get(i).toString());
+    }
+    return sentence.toString();
+  }
+
+}
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Sentence.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java?rev=1694008&view=auto
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
(added)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
Tue Aug 4 07:48:48 2015
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import opennlp.tools.disambiguator.Constants;
+
+ /**
+  * One token of a sentence together with its position (paragraph, sentence
+  * and word numbers) and its annotations (PoS tag, lemma, sense number and
+  * sense key).
+  *
+  * NOTE(review): {@link #equals(Object)} is overridden without a matching
+  * {@code hashCode()}; instances should not be used as hash keys until one is
+  * defined consistently with the two-branch comparison in {@code equals}.
+  */
+ public class Word {
+
+   /**
+    * Kind of token: a regular word or a punctuation mark.
+    */
+   public static enum Type {
+     WORD(1, "word"), PUNCTUATIONMARK(2, "pm");
+
+     public int code;
+     public String type;
+
+     private Type(int code, String type) {
+       this.code = code;
+       this.type = type;
+     }
+   }
+
+   protected int pnum;
+   protected int snum;
+   protected int wnum;
+
+   // Type refers to the type of word in the sentence
+   protected Type type;
+
+   protected String word;
+   protected String cmd;
+   protected String pos;
+   protected String lemma;
+   protected String wnsn;
+   protected String lexsn;
+
+   public Word() {
+     super();
+   }
+
+   /**
+    * Creates a word whose raw form and lemma are both the given lemma.
+    *
+    * @param lemma
+    *          The lemma of the word, also used as the raw word
+    * @param pos
+    *          The PoS Tag of the word
+    */
+   public Word(String lemma, String pos) {
+     super();
+     this.word = lemma;
+     this.lemma = lemma;
+     this.pos = pos;
+   }
+
+   /**
+    * This serves to create a DISAMBIGUATED word instance
+    *
+    * @param pnum
+    *          id of the paragraph
+    * @param snum
+    *          id of the sentence
+    * @param wnum
+    *          id of the word in the sentence
+    * @param type
+    *          the kind of token, see {@link Type} (e.g. {@link Type#WORD})
+    * @param word
+    *          The raw word, as it appears in the sentence
+    * @param cmd
+    *          Whether it is semantically disambiguated or not (or to be
+    *          disambiguated)
+    * @param pos
+    *          The PoS Tag of the word
+    * @param lemma
+    *          The lemma of the word
+    * @param wnsn
+    *          The integer sense number corresponding to the WordNet output
+    *          display
+    * @param lexsn
+    *          The "Sense_key" that indicates the WordNet sense to which word
+    *          should be linked
+    *
+    */
+   public Word(int pnum, int snum, int wnum, Type type, String word,
+       String cmd, String pos, String lemma, String wnsn, String lexsn) {
+     super();
+     this.pnum = pnum;
+     this.snum = snum;
+     this.wnum = wnum;
+     this.type = type;
+     this.word = word;
+     this.cmd = cmd;
+     this.pos = pos;
+     this.lemma = lemma;
+     this.wnsn = wnsn;
+     this.lexsn = lexsn;
+   }
+
+   /**
+    * This serves to create a NON DISAMBIGUATED word instance
+    *
+    * @param pnum
+    *          id of the paragraph
+    * @param snum
+    *          id of the sentence
+    * @param wnum
+    *          id of the word in the sentence
+    * @param type
+    *          the kind of token, see {@link Type} (e.g. {@link Type#WORD})
+    * @param word
+    *          The raw word, as it appears in the sentence
+    * @param cmd
+    *          Whether it is semantically disambiguated or not (or to be
+    *          disambiguated)
+    * @param pos
+    *          The PoS Tag of the word
+    *
+    */
+   public Word(int pnum, int snum, int wnum, Type type, String word,
+       String cmd, String pos) {
+     super();
+     this.wnum = wnum;
+     this.pnum = pnum;
+     this.snum = snum;
+     this.type = type;
+     this.word = word;
+     this.cmd = cmd;
+     this.pos = pos;
+   }
+
+   /**
+    * This serves to create a punctuation instance
+    *
+    * @param pnum
+    *          id of the paragraph
+    * @param snum
+    *          id of the sentence
+    * @param wnum
+    *          id of the token in the sentence
+    * @param type
+    *          The type as in {@link Type}
+    * @param word
+    *          The punctuation mark, as it appears in the sentence
+    */
+   public Word(int pnum, int snum, int wnum, Type type, String word) {
+     super();
+     this.pnum = pnum;
+     this.snum = snum;
+     // wnum was accepted but silently dropped before; store it like the
+     // other constructors do
+     this.wnum = wnum;
+     this.type = type;
+     this.word = word;
+   }
+
+   public int getPnum() {
+     return pnum;
+   }
+
+   public void setPnum(int pnum) {
+     this.pnum = pnum;
+   }
+
+   public int getSnum() {
+     return snum;
+   }
+
+   public void setSnum(int snum) {
+     this.snum = snum;
+   }
+
+   public int getWnum() {
+     return wnum;
+   }
+
+   public void setWnum(int wnum) {
+     this.wnum = wnum;
+   }
+
+   public String getWord() {
+     return word;
+   }
+
+   public void setWord(String word) {
+     this.word = word;
+   }
+
+   public Type getType() {
+     return type;
+   }
+
+   public void setType(Type type) {
+     this.type = type;
+   }
+
+   public String getCmd() {
+     return cmd;
+   }
+
+   public void setCmd(String cmd) {
+     this.cmd = cmd;
+   }
+
+   public String getPos() {
+     return pos;
+   }
+
+   public void setPos(String pos) {
+     this.pos = pos;
+   }
+
+   public String getLemma() {
+     return lemma;
+   }
+
+   public void setLemma(String lemma) {
+     this.lemma = lemma;
+   }
+
+   public String getWnsn() {
+     return wnsn;
+   }
+
+   public void setWnsn(String wnsn) {
+     this.wnsn = wnsn;
+   }
+
+   public String getLexsn() {
+     return lexsn;
+   }
+
+   public void setLexsn(String lexsn) {
+     this.lexsn = lexsn;
+   }
+
+   /**
+    * @return the raw word, as it appears in the sentence
+    */
+   @Override
+   public String toString() {
+     return this.word;
+   }
+
+   /**
+    * Compares two words. When both have a lemma, they are equal if the lemmas
+    * and the values of {@code Constants.getPOS} for their tags match;
+    * otherwise they are equal if the raw words and the values of
+    * {@code Constants.getPOSabbreviation} for their tags match. Missing
+    * (null) values are compared null-safely instead of throwing.
+    *
+    * @param oword
+    *          the object to compare with
+    * @return true if {@code oword} is a {@link Word} matching this one
+    */
+   @Override
+   public boolean equals(Object oword) {
+
+     if (oword == this)
+       return true;
+     if (!(oword instanceof Word))
+       return false;
+
+     Word iword = (Word) oword;
+
+     if (this.lemma != null && iword.getLemma() != null) {
+       return iword.getLemma().equals(this.getLemma())
+           && sameOrBothNull(Constants.getPOS(iword.getPos()),
+               Constants.getPOS(this.getPos()));
+     }
+     // NOTE(review): assumes Constants.getPOSabbreviation tolerates a null
+     // tag - confirm against Constants
+     return sameOrBothNull(this.word, iword.getWord())
+         && sameOrBothNull(Constants.getPOSabbreviation(this.getPos()),
+             Constants.getPOSabbreviation(iword.getPos()));
+   }
+
+   /**
+    * Tells whether this word is a sense-annotated occurrence of the given
+    * word tag.
+    *
+    * @param wordTag
+    *          a tag of the form "lemma.pos" (split on the dot)
+    * @return true if the lemma and the PoS abbreviation match the tag and a
+    *         sense key ({@code lexsn}) is set; false otherwise, including when
+    *         the tag is null or has no dot-separated PoS part
+    */
+   public boolean isInstanceOf(String wordTag) {
+
+     if (wordTag == null || this.lemma == null || this.lexsn == null) {
+       return false;
+     }
+
+     // split once; a tag without a PoS part cannot match
+     String[] parts = wordTag.split("\\.");
+     if (parts.length < 2) {
+       return false;
+     }
+     String oword = parts[0];
+     String otag = parts[1];
+
+     String tag = Constants.getPOSabbreviation(this.getPos());
+
+     // otag is never null here, so call equals on it to survive a null tag
+     return this.lemma.equals(oword) && otag.equals(tag);
+   }
+
+   /**
+    * Compares two words on lemma, PoS (via {@code Constants.getPOS}) and
+    * sense key. A missing lemma or sense key on either side makes the words
+    * different instead of throwing.
+    *
+    * @param oword
+    *          the object to compare with
+    * @return true if {@code oword} is this instance, or a {@link Word} with
+    *         the same lemma, PoS and sense key
+    */
+   public boolean senseEquals(Object oword) {
+
+     if (oword == this)
+       return true;
+     if (!(oword instanceof Word))
+       return false;
+
+     Word iword = (Word) oword;
+
+     if (iword.getLemma() == null || iword.getLexsn() == null) {
+       return false;
+     }
+
+     return iword.getLemma().equals(this.getLemma())
+         && sameOrBothNull(Constants.getPOS(iword.getPos()),
+             Constants.getPOS(this.getPos()))
+         && iword.getLexsn().equals(this.getLexsn());
+   }
+
+   /** Null-safe equality: true when both are null or {@code a.equals(b)}. */
+   private static boolean sameOrBothNull(Object a, Object b) {
+     return a == null ? b == null : a.equals(b);
+   }
+
+ }
Propchange:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/Word.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMS.java
Tue Aug 4 07:48:48 2015
@@ -58,8 +58,8 @@ import opennlp.tools.disambiguator.WSDPa
import opennlp.tools.disambiguator.WordPOS;
import opennlp.tools.disambiguator.WSDisambiguator;
import opennlp.tools.disambiguator.WordToDisambiguate;
-import opennlp.tools.disambiguator.DatasetsReader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
/**
* Implementation of the <b>It Makes Sense</b> approach originally proposed in
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/Lesk.java
Tue Aug 4 07:48:48 2015
@@ -33,6 +33,7 @@ import opennlp.tools.disambiguator.WordP
import opennlp.tools.disambiguator.WordSense;
import opennlp.tools.util.Span;
import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.data.Word;
@@ -95,6 +96,36 @@ public class Lesk implements WSDisambigu
return params;
}
+ /**
+  * Returns the sense key of the word in the first synset that
+  * {@code WordPOS#getSynsets()} yields for it (presumably WordNet's most
+  * frequent sense - TODO confirm the ordering of getSynsets()).
+  *
+  * @param wtd
+  *          the word to disambiguate; its raw word is lowercased for the
+  *          lookup and its part before the first dot is matched against the
+  *          synset lemmas
+  * @return the sense key, or null when the PoS tag is unknown, no synset is
+  *         found, no lemma of the first synset matches, or the sense key
+  *         lookup fails
+  */
+ protected String getMostFrequentSenseKey(WTDLesk wtd) {
+
+   String word = wtd.getRawWord().toLowerCase();
+   POS pos = Constants.getPOS(wtd.getPosTag());
+   if (pos == null) {
+     return null;
+   }
+
+   ArrayList<Synset> synsets = new WordPOS(word, pos).getSynsets();
+   // guard: get(0) on an empty (or missing) result used to throw
+   if (synsets == null || synsets.isEmpty()) {
+     return null;
+   }
+
+   // NOTE(review): the lookup above uses the lowercased word while this
+   // comparison uses the raw casing - confirm that is intended
+   String target = wtd.getRawWord().split("\\.")[0];
+
+   for (Word wd : synsets.get(0).getWords()) {
+     if (wd.getLemma().equals(target)) {
+       try {
+         return wd.getSenseKey();
+       } catch (JWNLException e) {
+         e.printStackTrace();
+         return null;
+       }
+     }
+   }
+   return null;
+ }
+
/**
* The basic Lesk method where the entire context is considered for overlaps
*
@@ -980,23 +1011,30 @@ public class Lesk implements WSDisambigu
Collections.sort(wsenses);
- List<Word> synsetWords;
- String[] senses = new String[wsenses.size()];
- String senseKey = "?";
- for (int i = 0; i < wsenses.size(); i++) {
- synsetWords = wsenses.get(i).getNode().synset.getWords();
- for (Word synWord : synsetWords) {
- if (synWord.getLemma().equals(wtd.getWord())) {
- try {
- senseKey = synWord.getSenseKey();
- } catch (JWNLException e) {
- e.printStackTrace();
+ String[] senses;
+ if (wsenses.get(0).getScore() > 0) { // if at least one overlap
+ List<Word> synsetWords;
+ senses = new String[wsenses.size()];
+ String senseKey = "?";
+ for (int i = 0; i < wsenses.size(); i++) {
+ synsetWords = wsenses.get(i).getNode().synset.getWords();
+ for (Word synWord : synsetWords) {
+ if (synWord.getLemma().equals(wtd.getWord())) {
+ try {
+ senseKey = synWord.getSenseKey();
+ } catch (JWNLException e) {
+ e.printStackTrace();
+ }
+ break;
}
- break;
}
- }
- senses[i] = "WordNet" + " " + senseKey + " " + wsenses.get(i).getScore();
+ senses[i] = "WordNet" + " " + senseKey + " "
+ + wsenses.get(i).getScore();
+ }
+ } else { // get the MFS if no overlaps
+ senses = new String[1];
+ senses[0] = "WordNet" + " " + this.getMostFrequentSenseKey(wtd) + " -1";
}
return senses;
}
Modified:
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/lesk/LeskParameters.java
Tue Aug 4 07:48:48 2015
@@ -40,10 +40,10 @@ public class LeskParameters extends WSDP
// DEFAULTS
protected static final LESK_TYPE DFLT_LESK_TYPE =
LESK_TYPE.LESK_EXT_EXP_CTXT_WIN;
- protected static final int DFLT_WIN_SIZE = 4;
- protected static final int DFLT_DEPTH = 3;
- protected static final double DFLT_IEXP = 0.3;
- protected static final double DFLT_DEXP = 0.3;
+ protected static final int DFLT_WIN_SIZE = 5;
+ protected static final int DFLT_DEPTH = 2;
+ protected static final double DFLT_IEXP = 0.4;
+ protected static final double DFLT_DEXP = 0.4;
protected LESK_TYPE leskType;
protected int win_f_size;
Modified:
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
Tue Aug 4 07:48:48 2015
@@ -22,7 +22,7 @@ package opennlp.tools.disambiguator;
import java.io.File;
import java.util.ArrayList;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.IMS;
import opennlp.tools.disambiguator.ims.IMSParameters;
import opennlp.tools.disambiguator.ims.WTDIMS;
Modified:
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/LeskEvaluatorTest.java
Tue Aug 4 07:48:48 2015
@@ -19,10 +19,9 @@
package opennlp.tools.disambiguator;
-import java.io.File;
import java.util.ArrayList;
-import java.util.HashMap;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.WTDIMS;
import opennlp.tools.disambiguator.lesk.Lesk;
import opennlp.tools.disambiguator.lesk.LeskParameters;
@@ -31,80 +30,58 @@ import org.junit.Test;
public class LeskEvaluatorTest {
- static DataExtractor dExtractor = new DataExtractor();
+ static SensevalReader seReader = new SensevalReader();
@Test
public static void main(String[] args) {
Constants.print("Evaluation Started");
- String testDataLoc = "src\\test\\resources\\data\\";
- String helpersLoc = "src\\test\\resources\\helpers\\";
-
- File[] listOfFiles;
- File testFolder = new File(testDataLoc);
-
- // these are needed for mapping the sense IDs from the current data
- String dict = helpersLoc + "EnglishLS.dictionary.xml";
- String map = helpersLoc + "EnglishLS.sensemap";
-
Lesk lesk = new Lesk();
LeskParameters leskParams = new LeskParameters();
leskParams.setLeskType(LeskParameters.LESK_TYPE.LESK_EXT_EXP_CTXT_WIN);
lesk.setParams(leskParams);
- if (testFolder.isDirectory()) {
- listOfFiles = testFolder.listFiles();
- for (File file : listOfFiles) {
- WSDEvaluator evaluator = new WSDEvaluator(lesk);
- if (file.isFile()) {
- // don't take verbs because they are not from WordNet
- if (!file.getName().split("\\.")[1].equals("v")) {
- HashMap<String, ArrayList<DictionaryInstance>> senses = dExtractor
- .extractWordSenses(dict, map, file.getName());
- ArrayList<WTDIMS> instances = getTestData(file.getAbsolutePath(),
- senses);
-
- if (instances != null) {
- Constants.print("------------------" + file.getName()
- + "------------------");
- for (WordToDisambiguate instance : instances) {
- // Constants.print("sense IDs : " + instance.senseIDs);
- evaluator.evaluateSample(instance);
- }
- Constants.print(evaluator.toString());
- } else {
- Constants.print("null instances");
+ ArrayList<String> words = seReader.getSensevalWords();
+
+ for (String word : words) {
+ WSDEvaluator evaluator = new WSDEvaluator(lesk);
+
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+
+ ArrayList<WTDIMS> instances = getTestData(word);
+
+ if (instances != null) {
+ Constants.print("------------------" + word + "------------------");
+ for (WordToDisambiguate instance : instances) {
+
+ if (instance.getSenseIDs() != null
+ && !instance.getSenseIDs().get(0).equals("null")) {
+ evaluator.evaluateSample(instance);
}
}
+ Constants.print(evaluator.toString());
+ } else {
+ Constants.print("null instances");
}
}
+
}
}
- protected static ArrayList<WTDIMS> getTestData(String testFile,
- HashMap<String, ArrayList<DictionaryInstance>> senses) {
- /**
- * word tag has to be in the format "word.POS" (e.g., "activate.v",
- * "smart.a", etc.)
- */
- ArrayList<WTDIMS> trainingData = dExtractor.extractWSDInstances(testFile);
-
- // HashMap<Integer, WTDIMS> trainingData =
- // dExtractor.extractWSDInstances(wordTrainingxmlFile);
- for (WTDIMS data : trainingData) {
- for (String senseId : data.getSenseIDs()) {
- for (String dictKey : senses.keySet()) {
- for (DictionaryInstance instance : senses.get(dictKey)) {
- if (senseId.equals(instance.getId())) {
- data.setSense(Integer.parseInt(dictKey.split("_")[1]));
- break;
- }
- }
+ protected static ArrayList<WTDIMS> getTestData(String wordTag) {
+
+ ArrayList<WTDIMS> instances = new ArrayList<WTDIMS>();
+ for (WordToDisambiguate wtd : seReader.getSensevalData(wordTag)) {
+ WTDIMS wtdims = new WTDIMS(wtd);
+ if (wtdims != null) {
+ if (wtdims.getSenseIDs().get(0) != null
+ && !wtdims.getSenseIDs().get(0).equalsIgnoreCase("U")) {
+ instances.add(wtdims);
}
}
}
-
- return trainingData;
+ return instances;
}
}
Modified:
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java?rev=1694008&r1=1694007&r2=1694008&view=diff
==============================================================================
---
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
(original)
+++
opennlp/sandbox/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/MFSEvaluatorTest.java
Tue Aug 4 07:48:48 2015
@@ -21,7 +21,7 @@ package opennlp.tools.disambiguator;
import java.util.ArrayList;
-import opennlp.tools.disambiguator.DatasetsReader.SensevalReader;
+import opennlp.tools.disambiguator.datareader.SensevalReader;
import opennlp.tools.disambiguator.ims.WTDIMS;
import opennlp.tools.disambiguator.mfs.MFS;
import opennlp.tools.disambiguator.mfs.MFSParameters;