[07/11] opennlp-sandbox git commit: removed stanford nlp refs

bgalitsky Tue, 22 Nov 2016 05:06:12 -0800

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
deleted file mode 100644
index 0f53ec5..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import org.apache.commons.lang3.StringUtils;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.similarity.apps.BingQueryRunner;
-import opennlp.tools.similarity.apps.HitBase;
-import opennlp.tools.similarity.apps.utils.ValueSortMap;
-import opennlp.tools.stemmer.PStemmer;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-public class LinguisticPhraseManager {
-       private Map<String, Integer> freq = new ConcurrentHashMap<String, 
Integer>();
-       
-       // the purpose to init this static object is to show the path to 
resources
-       private static StopList stop = StopList.getInstance(new 
File(".").getAbsolutePath().replace(".","")+ "src/test/resources/");
-
-       // this list will be overwritten by the external synonyms.csv
-       private static String[][] synonymPairs = new String[][]{};
-       private PStemmer stemmer = new PStemmer();
-
-       private List<ParseTreeChunk> lingPhrases = new 
ArrayList<ParseTreeChunk>();
-       private List<String> standardizedTopics = new ArrayList<String>();
-       // map which shows for each ling phrase the list of ling phrases with 
the same head noun it belongs
-       private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new 
ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
-
-       //  map which shows for each string phrase the list of ling phrases 
with the same head noun it belongs
-       private Map<String, List<ParseTreeChunk>> std_group = new 
ConcurrentHashMap<String, List<ParseTreeChunk>>();
-
-       private BingQueryRunner runner = new BingQueryRunner();
-       private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3;//2; 5
-       private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
-       // this function takes a log of a chain of the nodes of parse trees and 
builds their instances
-       // the phrases should only be VP or NP, otherwise an exception should 
be thrown
-       
-       
-
-       private String resourceDir;
-       public LinguisticPhraseManager(){
-               try {
-                       resourceDir  = new File( "." 
).getCanonicalPath()+"/src/main/resources/";
-                       List<String[]> vocabs = 
ProfileReaderWriter.readProfiles(resourceDir+"/synonyms.csv");
-                       synonymPairs = new String[vocabs.size()][2];
-                       int count = 0;
-                       for(String[] line: vocabs){
-                               try {
-                                       synonymPairs[count] = line;
-                                       count++;
-                   } catch (Exception e) {
-                       e.printStackTrace();
-                   }
-                       }
-                       
-               } catch (Exception e) {
-                       e.printStackTrace();
-               }               
-       }
-
-       private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr){
-               ParseTreeChunk ch = new ParseTreeChunk();
-               List<String> POSs = new ArrayList<String>(), lemmas = new 
ArrayList<String>();
-
-               String[] parts = phrStr.replace("]","").split(", <");
-
-               ch.setMainPOS( StringUtils.substringBetween(phrStr, ">", "'"));
-               try {
-                       for(String part: parts){
-                               String lemma = 
StringUtils.substringBetween(part, "P'", "':").toLowerCase();
-                               String pos = 
part.substring(part.indexOf(":")+1, part.length());
-
-                               if (pos==null || lemma ==null){
-                                       continue;
-                               }
-                               POSs.add(pos.trim());
-                               lemmas.add(lemma.trim());
-                               ch.setPOSs(POSs); ch.setLemmas(lemmas);
-                       }
-               } catch (Exception e) {
-                       // we expect exceptions if extracted phrases are 
NEITHER NP nor VP
-                       // empty chunk will be given which will not create a 
new topic
-                       e.printStackTrace();
-               }
-
-               return ch;
-       }
-
-       // this is a constructor with an array of extraction files
-       // optimized for performance
-       // only topics occurring more than MIN_NUMBER_OF_PHRASES_TO_CONSIDER 
times will be considered
-       public LinguisticPhraseManager(String[] loadPaths){
-               List<String[]> columns = new ArrayList<String[]>();
-               for(String file: loadPaths){
-                       columns.addAll(ProfileReaderWriter.readProfiles( file));
-               }
-
-               for(String[] l: columns){
-                       if (l.length<3 || l[1]==null || l[2]==null)
-                               continue;
-                       String word = l[1].toLowerCase().trim();
-                       if (word.indexOf("=>")>-1)
-                               continue;
-
-                       word = isAcceptableStringPhrase(word);
-                       if (word==null)
-                               continue;
-
-                       if (!freq.containsKey(word)) {
-                               freq.put(word, 1);
-
-                       } else {
-                               freq.put(word, freq.get(word) + 1);
-                               // once we reached the count for a topic, 
create it
-                               if 
(freq.get(word)==MIN_NUMBER_OF_PHRASES_TO_CONSIDER){
-                                       ParseTreeChunk ch = 
parseLingPhraseIntoParseTreeChunk(l[2]);
-                                       ch = isAcceptableLingPhrase(ch);
-                                       if (ch==null)
-                                               continue;
-                                       lingPhrases.add(ch);
-                               }
-                       }                 
-               }
-               // we dont need frequency data any more
-               freq.clear();
-       }
-
-       // this is a default constructor with a single topic extraction file
-       // not optimized for performance
-       public LinguisticPhraseManager(String loadPath){
-               List<String[]> columns = ProfileReaderWriter.readProfiles( 
loadPath);
-               for(String[] l: columns){
-                       if (l.length<3 || l[1]==null || l[2]==null)
-                               continue;
-                       String word = l[1].toLowerCase().trim();
-                       if (word.indexOf("=>")>-1)
-                               continue;
-
-                       word = isAcceptableStringPhrase(word);
-                       if (word==null)
-                               continue;
-
-                       if (!freq.containsKey(word)) {
-
-                               ParseTreeChunk ch = 
parseLingPhraseIntoParseTreeChunk(l[2]);
-                               ch = isAcceptableLingPhrase(ch);
-                               if (ch==null)
-                                       continue;
-                               freq.put(word, 1);
-                               lingPhrases.add(ch);
-                       } else {
-                               freq.put(word, freq.get(word) + 1);
-                       }                 
-
-
-               }
-               freq = ValueSortMap.sortMapByValue(freq, false);
-
-
-       }
-       // removing prepositions and articles in case it has not worked at 
phrase forming stage
-       private String isAcceptableStringPhrase(String word) {
-               if (word.startsWith("to "))
-                       return null;
-               if (word.startsWith("a "))
-                       return word.substring(2, word.length());
-
-               if (word.endsWith(" !") || word.endsWith(" ."))
-                       return word.substring(0, word.length()-2).trim();
-
-               return word;
-       }
-       // we only accept NP 
-       private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
-               if (!ch.getMainPOS().equals("NP"))
-                       return null;
-
-
-               return ch;
-       }
-
-       // groups are sets of phrases with the same head noun
-       // put all phrases in a group. Have a map from each phrase to its 
group: the list of members
-       public void doLingGrouping(){
-               for(int i=0; i< lingPhrases.size(); i++){
-                       for(int j=i+1; j< lingPhrases.size(); j++){
-                               ParseTreeChunk chI = lingPhrases.get(i);
-                               ParseTreeChunk chJ = lingPhrases.get(j);
-                               if 
(chI.getLemmas().get(chI.getLemmas().size()-1).equals(chJ.getLemmas().get(chJ.getLemmas().size()-1))
-                                               && 
chI.getPOSs().get(chI.getLemmas().size()-1).startsWith("NN") ){
-                                       List<ParseTreeChunk> values = null; 
-                                       if( 
chI.getLemmas().size()<chJ.getLemmas().size()){             
-
-                                               if (values == null)
-                                                       values = new 
ArrayList<ParseTreeChunk>();
-                                               values.add(chI);
-                                               entry_group.put(chJ, values);
-                                       } else {
-                                               values = entry_group.get(chI);
-                                               if (values == null)
-                                                       values = new 
ArrayList<ParseTreeChunk>();
-                                               values.add(chJ);
-                                               entry_group.put(chI, values);
-                                       }
-                               }
-                       }
-               }
-
-
-       }
-
-       public List<String> formStandardizedTopic(){
-               Set<ParseTreeChunk> keys = entry_group.keySet();
-               for(ParseTreeChunk k: keys){
-                       List<ParseTreeChunk> lingPhrases = entry_group.get(k);  
        
-                       for(int i=0; i< lingPhrases.size(); i++)
-                               for(int j=i+1; j< lingPhrases.size(); j++){
-                                       ParseTreeChunk chI = lingPhrases.get(i);
-                                       ParseTreeChunk chJ = lingPhrases.get(j);
-                                       List<String> lemmas = new 
ArrayList<String>(chI.getLemmas());
-                                       lemmas.retainAll(chJ.getLemmas());
-                                       if (lemmas.size()<2)
-                                               continue;
-                                       String buf = ""; List<String> 
candTopicLst = new ArrayList<String>();
-                                       for(String w: lemmas){
-                                               if 
(w.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER)
-                                                       continue;
-                                               if (!StringUtils.isAlpha(w))
-                                                       continue;
-                                               // find POS of w
-                                               boolean bAccept = false;
-                                               for(int iw=0; 
iw<chI.getLemmas().size(); iw++){
-                                                       if 
(w.equals(chI.getLemmas().get(iw))){
-                                                               if 
(chI.getPOSs().get(iw).startsWith("NN") || 
chI.getPOSs().get(iw).startsWith("JJ")
-                                                                               
|| chI.getPOSs().get(iw).startsWith("VB"))
-                                                                       
bAccept=true;
-                                                       }
-                                               }
-                                               if (bAccept){
-                                                       //buf+=w+" ";
-                                                       String ws = 
substituteSynonym(w);
-                                                       candTopicLst.add(ws);
-                                               }
-                                       }
-                                       // remove duplicates like 'new new 
house'
-                                       //candTopicLst = new 
ArrayList<String>(new HashSet<String>(candTopicLst));
-                                       for(String w: candTopicLst){
-                                               buf+=w+" ";
-                                       }
-
-                                       buf = buf.trim();
-                                       if (buf.indexOf(' ')<0)
-                                               continue;
-
-                                       if (!standardizedTopics.contains(buf)){
-                                               standardizedTopics.add(buf);    
        
-                                               std_group.put(buf, lingPhrases);
-                                       }
-                               }
-               }
-               cleanUpStandardizedTopics();
-
-               return standardizedTopics;
-       }
-
-       public void cleanUpStandardizedTopics(){
-               List<String> toDelete = new ArrayList<String>();
-               for(int i=0; i< standardizedTopics.size(); i++)
-                       for(int j=i+1; j< standardizedTopics.size(); j++){
-                               List<String> t1 = 
TextProcessor.fastTokenize(standardizedTopics.get(i), false);
-                               List<String> t2 = 
TextProcessor.fastTokenize(standardizedTopics.get(j), false);
-                               for(int k=0; k< t1.size(); k++){
-                                       t1.set(k, stemmer.stem(t1.get(k)));
-                               }
-                               for(int k=0; k< t2.size(); k++){
-                                       t2.set(k, stemmer.stem(t2.get(k)));
-                               } 
-                               // check if lists are equal
-                               if (t1.size()!=t2.size())
-                                       continue;
-                               //if in two phrases once all keywords are 
tokenized, one phrase annihilates another, 
-                               t1.removeAll(t2);
-                               if (t1.isEmpty()){ 
-                                       if (standardizedTopics.get(i).length()> 
standardizedTopics.get(j).length()){
-                                               
toDelete.add(standardizedTopics.get(i));
-                                               // TODO update std_group entry
-                                               System.out.println("Removing '" 
+ standardizedTopics.get(i) + "' because of '" + standardizedTopics.get(j) );
-                                               List<ParseTreeChunk> stJ = 
std_group.get(standardizedTopics.get(j));
-                                               
stJ.addAll(std_group.get(standardizedTopics.get(i)));
-                                               stJ = new 
ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
-                                               
std_group.put(standardizedTopics.get(j), stJ);
-                                       }
-                                       else {
-                                               
toDelete.add(standardizedTopics.get(j));
-                                               System.out.println("Removing '" 
+ standardizedTopics.get(j) + "' because of '" + standardizedTopics.get(i) );
-                                               List<ParseTreeChunk> stI = 
std_group.get(standardizedTopics.get(i));
-                                               
stI.addAll(std_group.get(standardizedTopics.get(j)));
-                                               stI = new 
ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
-                                               
std_group.put(standardizedTopics.get(i), stI);
-                                       }
-
-                               }
-                       }
-               for(String d: toDelete){
-                       //System.out.println("Removed '" + d + "'");
-                       standardizedTopics.remove(d);
-               }
-       }
-
-       // substitute synonyms according to internal vocab
-       private String substituteSynonym(String w) {
-               try {
-                       for(String[] pair: synonymPairs){
-                               if (w.equals(pair[0]))
-                                       return pair[1];
-                       }
-               } catch (Exception e) {
-                       e.printStackTrace();
-               }
-               return w;
-       }
-
-       public void generateGroupingReport(String reportName){
-               List<String[]>  report = new ArrayList<String[]>();
-               Set<ParseTreeChunk> chs = entry_group.keySet();
-               report.add(new String[]{"string phrase" , "class", "linguistic 
phrase",  "list of ling phrases class representatives"});
-
-               for(ParseTreeChunk ch: chs){
-                       String head = 
ch.getLemmas().get(ch.getLemmas().size()-1);
-                       List<ParseTreeChunk> values = entry_group.get(ch);
-                       if (values.size()<6)
-                               head = "";
-                       report.add(new String[]{ch.toWordOnlyString(), head,  
ch.toString(),  values.toString()});
-               }
-               ProfileReaderWriter.writeReport(report, reportName);
-       }
-
-       //final merge floor-floors-flooring as head nound with phrase update
-       public void applyLastRoundOfAggregation(){
-               //merge <floor - floors - flooring>
-               /*
-                       List<ParseTreeChunk> entries =  new 
ArrayList<ParseTreeChunk>(entry_group.keySet());
-                       for(int i=0; i< entries.size(); i++){
-                               for(int j=i+1; j< entries.size(); j++){
-                                       ParseTreeChunk chI = entries.get(i);
-                                       ParseTreeChunk chJ = entries.get(j);
-                                       String headI = 
getLastElement(chI.getLemmas());
-                                       String headJ = 
getLastElement(chJ.getLemmas());
-                                       if (headI==null || 
headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER  || 
-                                                       headJ==null || 
headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
-                                               continue;
-
-                                       if (headI.indexOf(headJ)>-1){
-                                               //leave headJ
-                                               List<ParseTreeChunk> 
valuesToAddTo = entry_group.get(chJ);
-                                               List<ParseTreeChunk> 
valuesBeingAdded = entry_group.get(chI);
-                                               if (valuesToAddTo==null || 
valuesBeingAdded == null)
-                                                       continue;
-                                               
valuesToAddTo.addAll(valuesBeingAdded);
-                                               entry_group.put(chJ, 
valuesToAddTo);
-                                               entry_group.remove(chI);
-                                               System.out.println("Deleting 
entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
-                                       } else if (headJ.indexOf(headI)>-1){
-                                               //leave headJ
-                                               List<ParseTreeChunk> 
valuesToAddTo = entry_group.get(chI);
-                                               List<ParseTreeChunk> 
valuesBeingAdded = entry_group.get(chJ);
-                                               if (valuesToAddTo==null || 
valuesBeingAdded == null)
-                                                       continue;
-                                               
valuesToAddTo.addAll(valuesBeingAdded);
-                                               entry_group.put(chI, 
valuesToAddTo);
-                                               entry_group.remove(chJ);
-                                               System.out.println("Deleting 
entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
-                                       }
-
-                               }
-                       }
-                */
-               for(int i = 0; i<standardizedTopics.size(); i++ )
-                       for(int j = i+1; j<standardizedTopics.size(); j++ ){
-                               String headI = 
extractHeadNounFromPhrase(standardizedTopics.get(i));
-                               String headJ = 
extractHeadNounFromPhrase(standardizedTopics.get(j));
-                               // if the same word do nothing
-                               if (headI.equals(headJ))
-                                       continue;
-
-                               //only if one is sub-word of another
-                               if (headI.indexOf(headJ)>-1){
-
-                                       if (!properSubWordForm(headI, headJ))
-                                               continue;
-                                       //entry 'I' will be updated
-                                       String newKey = 
standardizedTopics.get(i).replace(headI, headJ);
-
-                                       List<ParseTreeChunk> stI = 
std_group.get(standardizedTopics.get(i));
-                                       List<ParseTreeChunk> stInew = 
std_group.get(newKey);
-                                       //if (stInew!=null && !stInew.isEmpty())
-                                       //      stI.addAll(stInew);
-                                       if(stI==null)
-                                               continue;
-                                       std_group.put(newKey, stI);
-                                       
std_group.remove(standardizedTopics.get(i));
-                                       System.out.println("Deleted entry for 
key '"+ standardizedTopics.get(i) +"' and created  '"+ newKey +"'");
-                                       standardizedTopics.set(i, newKey);
-
-                               } else if (headJ.indexOf(headI)>-1){
-                                       if (!properSubWordForm(headJ, headI))
-                                               continue;
-                                       //entry 'J' will be updated
-                                       String newKey = 
standardizedTopics.get(j).replace(headJ, headI);
-
-                                       List<ParseTreeChunk> stJ = 
std_group.get(standardizedTopics.get(j));
-                                       List<ParseTreeChunk> stJnew = 
std_group.get(newKey);
-                                       //if (stJnew!=null && !stJnew.isEmpty())
-                                       //      stJ.addAll(stJnew);
-                                       if(stJ==null)
-                                               continue;
-                                       std_group.put(newKey, stJ);
-                                       
std_group.remove(standardizedTopics.get(j));
-                                       System.out.println("Deleted entry for 
key '"+ standardizedTopics.get(j) +"' and created  '"+ newKey +"'");
-                                       standardizedTopics.set(j, newKey);
-                               }
-                       }
-
-
-
-       }
-
-       private boolean properSubWordForm(String headI, String headJ) {
-               String suffix = headI.replace(headJ, "");
-               if (suffix.equals("s") || suffix.equals("ing") //|| 
suffix.equals("er") 
-                               || suffix.equals("rooms") ||
-                               suffix.equals("") || suffix.equals("counter") ||
-                               suffix.equals("room") || suffix.equals("back"))
-                       return true;
-
-               //System.out.println("Wrong word '"+ headI + "'reduction into 
'" + headJ +"'");
-               return false;
-       }
-
-       //generates report 
-       public void generateStdTopicReport(String reportName){
-               List<String[]>  report = new ArrayList<String[]>();
-               report.add(new String[]{"category", "topic", "sub-topics", 
"phrase instances" });
-
-               for(String t: standardizedTopics){
-
-                       String bufCover = "";
-                       int count = 0;
-                       List<ParseTreeChunk> ptcList = std_group.get(t);
-                       if (ptcList == null)
-                               continue;
-                       for(ParseTreeChunk ch: ptcList){
-                               List<String> candidate = 
TextProcessor.fastTokenize(ch.toWordOnlyString(), false);
-                               List<String> tList = 
TextProcessor.fastTokenize(t, false);
-                               List<String> tListChk = new 
ArrayList<String>(tList);
-
-                               tListChk.removeAll(candidate);
-                               // fully covered by phrase instance
-                               if (!tListChk.isEmpty() || 
ch.toWordOnlyString().equals(t)){
-                                       continue;
-                               }
-
-                               boolean bCovered = true;
-                               
-                               for(String ts: tList){
-                                       boolean bCandWordsIsCovered = false;
-                                       for(String s: candidate){
-                                               if ((s.indexOf(ts)>-1) )//  && 
properSubWordForm(s, ts))
-                                                       bCandWordsIsCovered = 
true;
-                                       }
-                                       if (!bCandWordsIsCovered){
-                                               bCovered = false;
-                                               break;
-                                       }
-                               }
-                               if (!bCovered)
-                                       continue;
-                               bufCover+=ch.toWordOnlyString()+ " # ";
-                               count++;
-                               if (count > 40)
-                                       break;
-
-                       }
-                       if (bufCover.endsWith(" # "))
-                               bufCover = bufCover.substring(0, 
bufCover.length()-3).trim();
-
-                       String buf = "";
-                       count = 0;
-                       // only up to 40 instances of phrases per 1-st level 
topic
-                       for(ParseTreeChunk ch: ptcList){
-                               buf+=ch.toWordOnlyString()+ "|";
-                               count++;
-                               if (count > 40)
-                                       break;
-                       }
-                       
-                       //TODO uncomment
-                       //t = spell.getSpellCheckResult(t);
-                       report.add(new String[]{extractHeadNounFromPhrase(t), 
t, bufCover, buf //, std_group.get(t).toString()
-                       });
-               }
-               
-               
-               ProfileReaderWriter.writeReport(report, reportName);
-       }
-       // get a last word from a phrase (supposed to be a head noun)
-       private String extractHeadNounFromPhrase(String topic){
-               String[] tops = topic.split(" ");
-               int len = tops.length;
-               if (len>1){
-                       return tops[len-1];
-               }
-               else return topic;
-       }
-
-       // get last elem of a list
-       private String getLastElement(List<String> arrayList ){
-               if (arrayList != null && !arrayList.isEmpty()) {
-                       return arrayList.get(arrayList.size()-1);
-               }
-               return null;
-       }
-       /*
-        * Using Bing API to check if an extracted phrase can be found on the 
web, therefore is a meaningful phrase 
-        */
-       public List<String> verifyTopic(){
-               Set<String> phrases = freq.keySet();
-               List<String> approvedPhrases = new ArrayList<String>();
-               for(String p: phrases){
-                       List<HitBase> hits = runner.runSearch("\""+p+"\"");
-                       for(HitBase h: hits){
-                               String lookup = h.getTitle() + " " + 
h.getAbstractText();
-                               if (lookup.indexOf(p)>-1){
-                                       approvedPhrases.add(p);
-                                       break;
-                               }
-                       }
-               }
-               return approvedPhrases;
-       }
-
-       public Set<String> getPhraseLookup(){
-               return freq.keySet();
-       }
-
-       // using phrase frequency to filter phrases
-       public boolean isAcceptablePhrase(String phrase){
-               Integer count = freq.get(phrase.toLowerCase().trim());
-               if (count==null)
-                       return false;
-
-               if (count>0 && count < 10000)
-                       return true;
-               return false;
-       }
-
-       public static void main(String[] args){
-               LinguisticPhraseManager man = new  LinguisticPhraseManager(
-                               
"/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv");
-               man.doLingGrouping();
-               man.generateGroupingReport("topics_groups7_mergedHeads.csv");
-               List<String> stdTopics = man.formStandardizedTopic();
-               man.applyLastRoundOfAggregation();
-               man.generateStdTopicReport("std_topics7_mergedHeads.csv");
-               System.out.println(stdTopics);
-
-       }
-}


http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
deleted file mode 100644
index b766c7c..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class NamedEntityExtractor {
-       protected static Matcher matcher;
-       private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
-       protected ArrayList<File> queue = new ArrayList<File>();
-       protected static PT2ThicketPhraseBuilder phraseBuilder;
-       protected static SentimentVocab sVocab = SentimentVocab.getInstance();
-       String resourceDirSentimentList = null;
-       Set<String> sentimentVcb = new HashSet<String> ();
-
-       static {
-               synchronized (NamedEntityExtractor.class) {
-                       matcher = new Matcher();
-                       phraseBuilder = new PT2ThicketPhraseBuilder();
-               }
-       }
-
-       public NamedEntityExtractor(){
-               try {
-                       resourceDirSentimentList = new File( "." 
).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
-               } catch (IOException e) {
-                       e.printStackTrace();
-               }
-               List<String[]> sentimentList=null;
-               sentimentList = 
ProfileReaderWriter.readProfiles(resourceDirSentimentList);
-               for(String[] line: sentimentList){
-                       sentimentVcb.add(line[0]);
-               }
-       }
-
-       protected boolean isSentimentWord(String word){
-               if (sentimentVcb.contains(word))
-                       return true;
-               else
-                       return false;           
-       }
-
-       public EntityExtractionResult extractEntities(String para){
-               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
-               List<String> extractedNERsWords = new ArrayList<String>();
-               List<List<ParseTreeNode>> extractedSentimentPhrases = 
-                               new ArrayList<List<ParseTreeNode>>();
-               EntityExtractionResult result = new EntityExtractionResult();
-
-               ParseThicket pt = null;
-
-               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
-               pt = matcher.buildParseThicketFromTextWithRST(para);
-               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-               for(List<ParseTreeNode> sentence: nodeList){
-                       //System.out.println("   Processing sentence: "+ 
sentence);
-                       boolean bInsideNER = false; 
-                       String currentPhrase = "";
-                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
-                       for(ParseTreeNode word: sentence){
-                               if (isNERforPhraseExtraction(word)){
-                                       //System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
-                                       if (bInsideNER){
-                                               currentPhrase += " 
"+word.getWord();
-                                               currentPhraseNode.add(word);
-                                       } else {
-                                               bInsideNER=true;
-                                               currentPhrase = word.getWord();
-                                               currentPhraseNode.add(word);
-                                       }
-                               } else {
-                                       if (bInsideNER){
-                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
-                                                       
extractedNERsWords.add(currentPhrase);
-                                                       
extractedNERs.add(currentPhraseNode);
-                                               currentPhrase = "";
-                                               bInsideNER=false;
-                                       } else {
-                                               // do nothing, continue scan
-                                       }
-                               }
-                       }
-                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
-                               extractedNERs.add(currentPhraseNode);
-                               extractedNERsWords.add(currentPhrase);
-                       }
-
-                       Set<String> foundSentimentWords = new HashSet<String>();
-                       // now we extract phrases
-                       List<List<ParseTreeNode>> phrases = pt.getPhrases();
-                       for(List<ParseTreeNode> phrase: phrases){
-                               // find a noun phrase under sentiment
-                               try {
-                                       for(int i = phrase.size()-1; i>-1; i--){
-                                               ParseTreeNode word = 
phrase.get(i);
-                                               if 
((isSentimentWord(word.getWord()) ||
-                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
-                                                       
foundSentimentWords.add(word.getWord());
-                                                       
System.out.println("Sentim = " + word.getWord() + " | Found opinionated phrase 
"+phrase.toString());
-                                                       if (phrase.size()>1 && 
phrase.size()<7)
-                                                               
extractedSentimentPhrases.add(phrase);                  
-                                                       break;
-                                               }
-                                       }
-                               } catch (Exception e) {
-                                       e.printStackTrace();
-                               }
-                       }
-
-               } 
-               
-               extractedSentimentPhrases = 
reduceExtractedPhrases(extractedSentimentPhrases);
-               
-               result.setExtractedNER(extractedNERs);
-               result.setExtractedNERWords(extractedNERsWords);
-               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-               return result;
-       }
-
-       private List<List<ParseTreeNode>> 
reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
-           List<Integer> idsToDelete = new ArrayList<Integer>();
-               for(int i = 0; i<extractedSentimentPhrases.size(); i++){
-                       for(int j = i+1; j<extractedSentimentPhrases.size(); 
j++){
-                               String phrStr1 = 
ParseTreeNode.toWordString(extractedSentimentPhrases.get(i));
-                               String phrStr2 = 
ParseTreeNode.toWordString(extractedSentimentPhrases.get(j));
-                               if (phrStr1 .indexOf(phrStr2 )>-1)
-                                       idsToDelete.add(j);
-                       }
-               }
-               List<List<ParseTreeNode>> resultPhrases = new 
ArrayList<List<ParseTreeNode>>();
-               for(int i = 0; i<extractedSentimentPhrases.size(); i++){
-                       if (!idsToDelete.contains(i))
-                               resultPhrases 
.add(extractedSentimentPhrases.get(i));
-               }
-           return resultPhrases ;
-    }
-
-       private boolean isNERforPhraseExtraction(ParseTreeNode word){
-               if ((word.getNe().equals("ORGANIZATION") 
||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
-                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
-                       return true;
-
-               return false;
-
-       }
-
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
deleted file mode 100644
index cb04154..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-
-public class PersonExtractor extends NamedEntityExtractor {
-       private boolean isNERforPhraseExtraction(ParseTreeNode word){
-               if ((word.getNe().equals("PERSON") ) &&
-                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
-                       return true;
-
-               return false;
-
-       }
-       
-       public EntityExtractionResult extractEntities(String para){
-               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
-               List<String> extractedNERsWords = new ArrayList<String>();
-               List<List<ParseTreeNode>> extractedSentimentPhrases = 
-                               new ArrayList<List<ParseTreeNode>>();
-               EntityExtractionResult result = new EntityExtractionResult();
-
-               ParseThicket pt = null;
-
-               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
-               pt = matcher.buildParseThicketFromTextWithRST(para);
-               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-               for(List<ParseTreeNode> sentence: nodeList){
-                       System.out.println("   Processing sentence: "+ 
sentence);
-                       boolean bInsideNER = false; 
-                       String currentPhrase = "";
-                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
-                       for(ParseTreeNode word: sentence){
-                               if (isNERforPhraseExtraction(word)){
-                                       System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
-                                       if (bInsideNER){
-                                               currentPhrase += " 
"+word.getWord();
-                                               currentPhraseNode.add(word);
-                                       } else {
-                                               bInsideNER=true;
-                                               currentPhrase = word.getWord();
-                                               currentPhraseNode.add(word);
-                                       }
-                               } else {
-                                       if (bInsideNER){
-                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
-                                                       
extractedNERsWords.add(currentPhrase);
-                                                       
extractedNERs.add(currentPhraseNode);
-                                               currentPhrase = "";
-                                               bInsideNER=false;
-                                       } else {
-                                               // do nothing, continue scan
-                                       }
-                               }
-                       }
-                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
-                               extractedNERs.add(currentPhraseNode);
-                               extractedNERsWords.add(currentPhrase);
-                       }
-
-                       Set<String> foundSentimentWords = new HashSet<String>();
-                       // now we extract phrases
-                       List<List<ParseTreeNode>> phrases = 
phraseBuilder.buildPT2ptPhrases(pt);
-                       for(List<ParseTreeNode> phrase: phrases){
-                               // find a noun phrase under sentiment
-                               try {
-                                       for(int i = phrase.size()-1; i>-1; i--){
-                                               ParseTreeNode word = 
phrase.get(i);
-                                               if 
((isSentimentWord(word.getWord()) ||
-                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
-                                                       
foundSentimentWords.add(word.getWord());
-                                                       
System.out.println("Found opinionated phrase "+phrase.toString());
-                                                       
extractedSentimentPhrases.add(phrase);                  
-                                                       break;
-                                               }
-                                       }
-                               } catch (Exception e) {
-                                       e.printStackTrace();
-                               }
-                       }
-
-               } 
-               result.setExtractedNER(extractedNERs);
-               result.setExtractedNERWords(extractedNERsWords);
-               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-               return result;
-       }
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
deleted file mode 100644
index 86cd2dc..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class SentencePhraseGivenAWordGetter {
-       protected static Matcher matcher;
-       protected ArrayList<File> queue = new ArrayList<File>();
-       protected static PT2ThicketPhraseBuilder phraseBuilder;
-
-
-       static {
-               synchronized (SentencePhraseGivenAWordGetter.class) {
-                       matcher = new Matcher();
-                       phraseBuilder = new PT2ThicketPhraseBuilder();
-               }
-       }
-
-       public SentencePhraseGivenAWordGetter(){
-       }
-
-       public EntityExtractionResult extractEntities(String para, String 
keyword){
-               List<List<ParseTreeNode>> extractedPhrases = new 
ArrayList<List<ParseTreeNode>>();
-
-               EntityExtractionResult result = new EntityExtractionResult();
-
-               ParseThicket pt =  
matcher.buildParseThicketFromTextWithRST(para);
-
-               List<List<ParseTreeNode>> phrases = pt.getPhrases();
-               for(List<ParseTreeNode> phrase: phrases){
-                       // find a noun phrase under sentiment
-                       try {
-                               for(int i = 0; i<phrase.size(); i++){
-                                       ParseTreeNode word = phrase.get(i);
-                                       if 
(word.getWord().toLowerCase().equals(keyword.toLowerCase())){
-                                               extractedPhrases.add(phrase);   
        
-                                               break;
-                                       }
-                               }
-                       } catch (Exception e) {
-                               e.printStackTrace();
-                       }
-               }
-
-               result.setExtractedSentimentPhrases(extractedPhrases);
-               return result;
-       }
-
-
-       public static void main(String[] args){
-               SentencePhraseGivenAWordGetter self = new 
SentencePhraseGivenAWordGetter();
-               EntityExtractionResult result = self.extractEntities("However i 
put a foam panel inside the main case if i do not have my headphones or an iPad 
to brace the mac book", 
-                               "panel");
-               System.out.println(result.getExtractedSentimentPhrases());
-       }
-}
-
-
-/*
- 3 phrases are given as a result
- * 
-[[<2>SBAR'i':FW, <3>SBAR'put':VBD, <4>SBAR'a':DT, <5>SBAR'foam':NN, 
<6>SBAR'panel':NN, <7>SBAR'inside':IN, <8>SBAR'the':DT, <9>SBAR'main':JJ, 
<10>SBAR'case':NN, <11>SBAR'if':IN, <12>SBAR'i':FW, 
-<13>SBAR'do':VBP, <14>SBAR'not':RB, <15>SBAR'have':VB, <16>SBAR'my':PRP$, 
<17>SBAR'headphones':NNS, <18>SBAR'or':CC, <19>SBAR'an':DT, <20>SBAR'iPad':NN, 
<21>SBAR'to':TO, 
-<22>SBAR'brace':VB, <23>SBAR'the':DT, <24>SBAR'mac':NN, <25>SBAR'book':NN], 
-
-[<3>VP'put':VBD, <4>VP'a':DT, <5>VP'foam':NN, <6>VP'panel':NN, 
<7>VP'inside':IN, <8>VP'the':DT, <9>VP'main':JJ, <10>VP'case':NN, 
<11>VP'if':IN, <12>VP'i':FW, <13>VP'do':VBP, 
-<14>VP'not':RB, <15>VP'have':VB, <16>VP'my':PRP$, <17>VP'headphones':NNS, 
<18>VP'or':CC, <19>VP'an':DT, <20>VP'iPad':NN, <21>VP'to':TO, <22>VP'brace':VB, 
<23>VP'the':DT, 
-<24>VP'mac':NN, <25>VP'book':NN], 
-
-[<4>NP'a':DT, <5>NP'foam':NN, <6>NP'panel':NN]]
-
-*/

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
deleted file mode 100644
index 1efe428..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import edu.stanford.nlp.ling.CoreAnnotation;
-
-import edu.stanford.nlp.trees.Tree;
-
-/**
- * Annotations specific to the Sentiment project.  In case there are
- * other projects that use the same RNN machinery, including the RNN
- * core annotations, this lets a sentence have a tree attached where
- * that tree specifically has the sentiment annotations.
- *
- * @author John Bauer
- */
-public class SentimentCoreAnnotations {
-
-  /**
-   * A tree which contains the annotations used for the Sentiment
-   * task.  After forwardPropagate has been called, the Tree will have
-   * prediction, etc. attached to it.
-   */
-  public static class SentimentAnnotatedTree implements CoreAnnotation<Tree> {
-    @Override
-    public Class<Tree> getType() {
-      return Tree.class;
-    }
-  }
-
-
-  /**
-   * The final label given for a sentence.  Set by the
-   * SentimentAnnotator and used by various forms of text output.
-   */
-  public static class SentimentClass implements CoreAnnotation<String> {
-    @Override
-    public Class<String> getType() {
-      return String.class;
-    }
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
deleted file mode 100755
index ad0f791..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.List;
-
-import opennlp.tools.stemmer.PStemmer;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-
-public class StopList {
-    private static StopList m_StopList = null;
-    private static Hashtable<String, HashSet<String>> m_stopHash = new 
Hashtable<String, HashSet<String>>();
-    public static final Log logger = LogFactory.getLog(StopList.class);
-    private static final String DEFAULT_STOPLIST = "STANDARD";
-    public static String resourceDir =null;
-    private static PStemmer stemmer = new PStemmer();
-
-    static {
-        synchronized (StopList.class) {
-            try {
-                LoadStopList();
-            } catch (IOException e) {
-                // TODO Auto-generated catch block
-                e.printStackTrace();
-            }
-        }
-    }
-
-    /**
-     * Get the StopList singleton instance.
-     * 
-     * @return The StopList
-     */
-    static public synchronized StopList getInstance() {
-
-        if (m_StopList == null) {
-            m_StopList = new StopList();
-
-            try {
-                m_StopList.LoadStopList();
-            } catch (Exception e) {
-
-            }
-        }
-        return m_StopList;
-    }
-
-    static public synchronized StopList getInstance(String dir) {
-        resourceDir = dir;
-        if (m_StopList == null) {
-            m_StopList = new StopList();
-
-            try {
-                m_StopList.LoadStopList();
-            } catch (Exception e) {
-
-            }
-        }
-        return m_StopList;
-    }
-
-    private static void LoadStopList() throws IOException {
-
-        File dir = new File(resourceDir + "/maps");
-        String[] children = dir.list();
-        if (children == null) {
-            System.err.println("Problem reading Stop Lists!");
-        } else {
-            for (int i = 0; i < children.length; i++) {
-                String fn = children[i];
-                if (fn.endsWith(".vcb")) {
-                    String fileName = resourceDir + "/maps/" + fn;
-                    File f = new File(fileName);
-                    loadStopListFile(f);
-                }
-            }
-        }
-    }
-
-    private static void loadStopListFile(File f) throws FileNotFoundException {
-
-        FileReader fileReader = new FileReader(f);
-        BufferedReader in = new BufferedReader(fileReader);
-
-        String str = new String();
-        boolean fLine = true;
-        HashSet<String> t = new HashSet<String>();
-        String listName = "";
-
-        try {
-            while ((str = in.readLine()) != null) {
-                if (fLine && str.length() > 0) {
-                    fLine = false;
-                    listName = str;
-                } else {
-                    t.add(str);
-                }
-            }
-        } catch (IOException ioe) {
-
-        } finally {
-            try {
-                if (in != null) {
-                    in.close();
-                }
-                if (fileReader != null) {
-                    fileReader.close();
-                }
-            } catch (IOException ioe) {
-                ioe.printStackTrace();
-            }
-        }
-
-        if (listName.length() > 0) {
-            HashSet<String> l = m_stopHash.get(listName);
-            if (l != null) {
-                synchronized (l) {
-                    m_stopHash.put(listName, t);
-                }
-            } else {
-                m_stopHash.put(listName, t);
-            }
-        }
-    }
-
-    /**
-     * Is the given word in the stop words list? Uses the defaut "STANDARD"
-     * stoplist
-     * 
-     * @param str
-     *            The word to check
-     * @return is a stop word
-     */
-    public static boolean isStopWord(String str) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey(DEFAULT_STOPLIST))
-            retVal = m_stopHash.get(DEFAULT_STOPLIST).contains(str);
-        return retVal;
-    }
-
-    public static boolean isFirstName(String str) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey("FIRST_NAMES"))
-            retVal = m_stopHash.get("FIRST_NAMES").contains(str.toUpperCase());
-        return retVal;
-    }
-
-    public String getRandomFirstName() {
-        HashSet<String> firstNames = m_stopHash.get("FIRST_NAMES");
-        int indexRand = (int) (Math.random() * new Float(firstNames.size()));
-        Iterator iter = firstNames.iterator();
-        for (int i = 0; i < indexRand; i++) {
-            iter.next();
-        }
-        return ((String) iter.next()).toLowerCase();
-    }
-
-    public static boolean isCommonWord(String str) {
-        if (str == null)
-            return true;
-        String stemmed="";
-               try {
-                       stemmed = stemmer.stem(str).toLowerCase();
-               } catch (Exception e) {
-                       //stemming exceptions are not informative, jiust ignore 
wthis word
-                       //e.printStackTrace();
-               }
-
-        boolean retVal = false;
-        if (m_stopHash.containsKey("ENG_DICT"))
-            retVal = m_stopHash.get("ENG_DICT").contains(stemmed);
-        return retVal;
-    }
-
-    public boolean isCommonEventWord(String str) {
-        if (str == null)
-            return true;
-        boolean retVal = false;
-
-        try {
-            String stemmed = str.toLowerCase();
-
-            if (m_stopHash.containsKey("fREQUENTEVENTNAMEWORDS"))
-                retVal = m_stopHash.get("fREQUENTEVENTNAMEWORDS").contains(
-                        stemmed);
-        } catch (Exception e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-        }
-        return retVal;
-    }
-
-    /**
-     * Is the given word in the stop words list provided?
-     * 
-     * @param str
-     *            The word to check
-     * @param stop_list
-     *            the name of the stoplist to check against
-     * @return is a stop word
-     */
-    public static boolean isStopWord(String str, String stop_list) {
-        boolean retVal = false;
-        if (m_stopHash.containsKey(stop_list))
-            retVal = m_stopHash.get(stop_list).contains(str);
-        return retVal;
-    }
-
-    public boolean isStopWordAll(String str) {
-        return isStopWord(str);
-    }
-
-    public HashSet<String> getStopListMap(String name) {
-        return m_stopHash.get(name);
-    }
-
-    public static List<List<String>> preFilterCommonEnglishExpressions(
-            List<String> userLikes) {
-        List<List<String>> results = new ArrayList<List<String>>();
-
-        List<String> resultUserLikes = new ArrayList<String>(), 
potentialCategs = new ArrayList<String>();
-        if (userLikes.size() < 6) {// too short, do not filter
-            results.add(userLikes);
-            results.add(potentialCategs);
-            return results;
-
-        }
-
-        for (String like : userLikes) {
-            like = like.toLowerCase();
-            if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
-                logger.info("removed isAlphanumeric " + like);
-                continue;
-            }
-
-            if (StringUtils.isNumeric(like)) {
-                logger.info("removed isNumericSpace " + like);
-                continue;
-            }
-
-            if (like.length() < 4) {
-                logger.info("removed too short likes " + like);
-                continue;
-            }
-            boolean existFirstName = false, allWordsCommonEnglish = true, 
bStop = false;
-            String[] comps = like.split(" ");
-            StringBuffer buf = new StringBuffer();
-            for (String word : comps) {
-                boolean isCommon = isCommonWord(word);
-                boolean isName = isFirstName(word);
-                if (!isCommon)
-                    allWordsCommonEnglish = false;
-                if (isName)
-                    existFirstName = true;
-                if (isStopWord(word) || word.length() < 3)
-                    bStop = true;
-                else
-                    buf.append(word + " ");
-            } // / does not have to include stop word
-            if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
-                logger.info("moved to category:  
NoFirstName+AllCommonEng+ShorterThan3 "
-                        + like);
-
-                continue;
-            }
-            if (!existFirstName && allWordsCommonEnglish && comps.length == 1) 
{
-                logger.info("moved to category: 
NoFirstName+AllCommonEng+Short1word "
-                        + like);
-                potentialCategs.add(like);
-                continue;
-            }
-
-            if (existFirstName && comps.length == 1) {
-                logger.info("removed : only first name, no last name " + like);
-
-                continue;
-            }
-
-            resultUserLikes.add(buf.toString().trim());
-
-        }
-
-        resultUserLikes = new ArrayList<String>(new HashSet<String>(
-                resultUserLikes));
-        if (resultUserLikes.size() > 1) {
-            results.add(resultUserLikes);
-            results.add(potentialCategs);
-            return results;
-        }
-
-        else {// do not do reduction
-            results.add(userLikes);
-            results.add(potentialCategs);
-            return results;
-        }
-    }
-
-    public static boolean isAcceptableIndividualLikes(String like) {
-        StopList finder = StopList.getInstance();
-        like = like.toLowerCase();
-        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
-            logger.info("removed isAlphanumeric " + like);
-            return false;
-        }
-
-        if (StringUtils.isNumeric(like)) {
-            logger.info("removed isNumericSpace " + like);
-            return false;
-        }
-
-        if (like.length() < 4) {
-            logger.info("removed too short likes " + like);
-            return false;
-        }
-        boolean existFirstName = false, allWordsCommonEnglish = true, bStop = 
false;
-        String[] comps = like.split(" ");
-        StringBuffer buf = new StringBuffer();
-        for (String word : comps) {
-            boolean isCommon = finder.isCommonWord(word);
-            boolean isName = finder.isFirstName(word);
-            if (!isCommon)
-                allWordsCommonEnglish = false;
-            if (isName)
-                existFirstName = true;
-            if (finder.isStopWord(word) || word.length() < 3)
-                bStop = true;
-            else
-                buf.append(word + " ");
-        } // / does not have to include stop word
-        if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
-            logger.info("  NoFirstName+AllCommonEng+ShorterThan3 " + like);
-
-            return false;
-        }
-        if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
-            logger.info(" NoFirstName+AllCommonEng+Short1word " + like);
-
-            return false;
-        }
-
-        if (existFirstName && comps.length == 1) {
-            logger.info("removed : only first name, no last name " + like);
-
-            return false;
-        }
-
-        return true;
-    }
-
-    @SuppressWarnings("all")
-    public static void main(String[] args) {
-
-        StopList list = StopList
-                
.getInstance("/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/");
-        Boolean b = list.isCommonWord("demonstration");
-
-        String fname = list.getRandomFirstName();
-
-        b = list.isCommonEventWord("tour");
-        b = list.isCommonEventWord("dance");
-        b = list.isCommonEventWord("salsa");
-        b = list.isCommonEventWord("center");
-        b = list.isCommonEventWord("family");
-
-      
-
-        b = isAcceptableIndividualLikes("forest glen");
-        b = isAcceptableIndividualLikes("drive");
-        b = isAcceptableIndividualLikes("house");
-        b = isAcceptableIndividualLikes("Timothy Kloug");
-        b = isAcceptableIndividualLikes("Mamma Mia");
-
-    }
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
deleted file mode 100644
index f4d56aa..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
+++ /dev/null
@@ -1,117 +0,0 @@
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-
-public class TopicAsOpinionMinerRunner {
-       private List<File> queue;
-       private final static String reviewSource = 
"/Users/bgalitsky/Documents/solr/example/exampledocs/publication_page0.json";
-       NamedEntityExtractor neExtractor = new NamedEntityExtractor();
-       Set<String> allPhrases = new HashSet<String>();
-       
-       public void processJSONfileWithReviews(){
-               List<String[]> report = new ArrayList<String[]>();
-               report.add(new String[] { "text", "phrases of potential 
interest list" , });
-
-               
-               String content=null;
-               try {
-                       content = FileUtils.readFileToString(new 
File(reviewSource));
-               } catch (IOException e) {
-                       e.printStackTrace();
-               }
-               String[] texts = StringUtils.substringsBetween(content, 
"summary\":\"", "\"");
-               for(String text: texts){
-                       report.clear();
-                       EntityExtractionResult result = 
neExtractor.extractEntities(text);
-                       //report.add(new String[]{text});
-                       allPhrases.addAll(result.extractedNERWords);
-                       allPhrases = new HashSet<String>(allPhrases);
-                       for(String p: allPhrases){
-                               report.add(new String[]{p});
-                       }
-                       /*
-                       String[] phrases = 
(String[])result.extractedNERWords.toArray(new String[0]);
-                       if (phrases!=null && phrases.length>0)
-                               report.add(phrases);
-                       */
-                       
/*report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
-                       List<String> stringPhrases = new ArrayList<String>(),
-                                       nodePhrases = new ArrayList<String>();
-                       for(List<ParseTreeNode> chList: 
result.extractedSentimentPhrases){
-                               String buf = "", nodeBuf="";
-                               for(ParseTreeNode ch: chList){
-                                       buf+=ch.getWord()+ " ";
-                                       nodeBuf+=ch.toString()+ " ";
-                               }
-                               stringPhrases.add(buf.trim());
-                               nodePhrases.add(nodeBuf.trim());
-                       }
-                       report.add((String[])stringPhrases.toArray(new 
String[0]));
-                       report.add((String[])nodePhrases.toArray(new 
String[0]));
-                       */
-                       
-                       ProfileReaderWriter.writeReport(report, 
"phrasesExtracted3.csv");
-               }
-       }
-
-       private void addFiles(File file) {
-
-               if (!file.exists()) {
-                       System.out.println(file + " does not exist.");
-
-                       if (file.isDirectory()) {
-                               for (File f : file.listFiles()) {
-                                       if (f.getName().startsWith("."))
-                                               continue;
-                                       addFiles(f);
-                                       System.out.println(f.getName());
-                               }
-                       } else {
-                               queue.add(file);
-
-                       }
-               }
-       }
-       
-       public static void main(String[] args){
-               TopicAsOpinionMinerRunner runner = new 
TopicAsOpinionMinerRunner();
-               runner.processJSONfileWithReviews();
-
-       }
-}
-
-/*
-       public void processDirectory(String path){
-               List<String[]> report = new ArrayList<String[]>();
-               report.add(new String[] { "filename", "named entity list", 
"phrases of potential interest list" });
-
-               List<String> allNamedEntities = new ArrayList<String>();
-
-               addFiles(new File(path));
-               for(File f: queue){
-                       List<String> entities = (List<String>) 
extractEntities(f.getAbsolutePath()).getFirst();
-                       List<String> opinions = (List<String>) 
extractEntities(f.getAbsolutePath()).getSecond();
-                       report.add(new String[]{ f.getName(), 
entities.toString(),  opinions.toString()});      
-                       ProfileReaderWriter.writeReport(report, 
"nameEntitiesExtracted.csv");
-
-                       allNamedEntities.addAll(entities);
-
-                       allNamedEntities = new ArrayList<String>(new 
HashSet<String> (allNamedEntities ));
-
-
-               }
-               ProfileReaderWriter.writeReport(report, 
"nameEntitiesTopicsOfInterestExtracted.csv");
-       } 
-} */

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
deleted file mode 100644
index a704f22..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.matching.Matcher;
-
-public class TopicPhraseExtractor {
-       Matcher matcher = new Matcher();
-
-       // sentiment vocabulary for phrase under the focus of sentiment
-       SentimentVocab sVocab = SentimentVocab.getInstance();
-       //This is used to create an XML with phrases. The same class for acro  
& phrases
-
-       public EntityExtractionResult extractEntities(String para){
-               EntityExtractionResult result = new EntityExtractionResult();
-               List<String> extractedNerPhrasesStr = new ArrayList<String>(), 
-                               extractedNerExactStr = new ArrayList<String>(),
-                               extractedSentimentPhrasesStr = 
-                               new ArrayList<String>(), 
extractedNONSentimentPhrasesStr = 
-                               new ArrayList<String>(), extractedNerPhraseTags 
= new ArrayList<String>();
-               // no need to change to extract more/less phrases
-               ParseThicket pt = 
matcher.buildParseThicketFromTextWithRST(para);
-
-               List<List<ParseTreeNode>> extractedSentimentPhrases = new 
ArrayList<List<ParseTreeNode>>(), 
-                               extractedNONSentimentPhrases = new 
ArrayList<List<ParseTreeNode>>(),
-                               extractedNerPhrases = new 
ArrayList<List<ParseTreeNode>>(),
-                                               extractedNerExactPhrases= new 
ArrayList<List<ParseTreeNode>>();
-               //TODO document examples / cases for each rule
-               // now we extract phrases
-               List<List<ParseTreeNode>> phrases = pt.getPhrases();
-               List<Float> sentimentProfile = pt.getSentimentProfile();
-               for(List<ParseTreeNode> phrase: phrases){
-
-                       // find a noun phrase under sentiment
-                       boolean bAccept = true, bNER = false;
-
-                       String phraseStr = asString(phrase);
-
-
-                       if (!phrase.get(0).getPhraseType().equals("NP") && 
!phrase.get(0).getPhraseType().equals("VP") )        
-                               bAccept = false;
-
-                       boolean bSentiment = false;
-                       for(ParseTreeNode word: phrase){
-                               if (sVocab.isSentimentWord(word.getWord())){
-                                       bSentiment=true;
-                                       break;
-                               }
-                       }
-
-                       String nerTagConfirmed = null;
-                       for(ParseTreeNode word: phrase){
-                               // no Named Entity
-                               String nerTag = isNERforPhraseExtraction(word);
-                               if (nerTag!=null){
-                                       bNER = true;
-                                       nerTagConfirmed = nerTag;
-                               }
-
-                               // no numbers nor prepositions
-                               if (word.getPos().startsWith("CD") || 
word.getPos().indexOf("PRP")>-1 )
-                                       bAccept = false;
-                       }
-                       if (!bAccept)
-                               continue;
-                       // was 7 -> 2
-                       if (phrase.size()>7 || phrase.size()<2)
-                               bAccept = false;
-
-                       if (phrase.get(0).getPos().equals("DT") && 
phrase.size()<3)
-                               bAccept = false;
-                       if (!bAccept)
-                               continue;
-
-                       String cleanedPhraseStr = cleanPhraseString(phraseStr);
-                       if (cleanedPhraseStr==null)
-                               bAccept = false;
-
-                       if (bAccept){
-                               if (bNER){
-                                       extractedNerPhrases.add(phrase);
-                                       extractedNerPhrasesStr.add(phraseStr);
-                                       
extractedNerPhraseTags.add(nerTagConfirmed );
-                                       // forming exact NER
-                                       List<ParseTreeNode> phraseNER_exact = 
new ArrayList<ParseTreeNode>();
-                                       String nerExactStr = "";
-                                       for(ParseTreeNode word: phrase){
-                                               String ner = 
isNERforPhraseExtraction(word);
-                                               if (ner!=null && 
ner.equals(nerTagConfirmed)){
-                                                       
phraseNER_exact.add(word);
-                                                       nerExactStr+=" 
"+word.getWord();
-                                               }
-                                       }
-                                       nerExactStr.trim();
-                                       
extractedNerExactPhrases.add(phraseNER_exact);
-                                       extractedNerExactStr.add(nerExactStr);
-                               }
-                               else if (bSentiment) {
-                                       
extractedSentimentPhrasesStr.add(cleanedPhraseStr);                             
        
-                                       extractedSentimentPhrases.add(phrase);
-                               } else {
-                                       
extractedNONSentimentPhrasesStr.add(cleanedPhraseStr);                          
        
-                                       
extractedNONSentimentPhrases.add(phrase);
-                               }
-                       }
-               } 
-
-               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-               
result.setExtractedSentimentPhrasesStr(extractedSentimentPhrasesStr);
-
-               
result.setExtractedNONSentimentPhrases(extractedNONSentimentPhrases);
-               
result.setExtractedNONSentimentPhrasesStr(extractedNONSentimentPhrasesStr);
-               
-               result.setExtractedNerPhrases(extractedNerPhrases);
-               result.setExtractedNerPhrasesStr(extractedNerPhrasesStr);
-               result.setExtractedNerPhraseTags(extractedNerPhraseTags);
-               
-               result.setExtractedNerExactPhrases(extractedNerExactPhrases);
-               result.setExtractedNerExactStr(extractedNerExactStr);
-
-               result.setSentimentProfile(sentimentProfile );
-
-               return result;
-       }
-
-
-
-
-
-
-       private String cleanPhraseString(String phraseStr) {
-               String p = phraseStr.toLowerCase();
-
-               if (p.startsWith("*") || p.startsWith("&") || p.startsWith("$"))
-                       return null;
-
-               if (p.startsWith("this ") || p.startsWith("other "))
-                       return null;
-
-               if (p.startsWith("a "))
-                       p = p.substring(2, p.length());
-               if (p.startsWith("the "))
-                       p = p.substring(4, p.length());
-               if (p.startsWith(", "))
-                       p = p.substring(2, p.length());
-
-               return p;
-       }
-
-       private String asString(List<ParseTreeNode> phrase) {
-               String buf = "";
-               for(ParseTreeNode p: phrase)
-                       buf+=p.getWord()+" ";
-               return buf.trim();
-       }
-
-       private String isNERforPhraseExtraction(ParseTreeNode word){
-               if (word.getNe() == null)
-                       return null;
-               
-
-               if (!(word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-                               word.getPos().startsWith("JJ") || 
word.getPos().startsWith("DT")))
-                       return null;
-                               
-
-               if (word.getNe().equals("ORGANIZATION"))
-                               return "ORGANIZATION";
-               if(word.getNe().equals("LOCATION"))
-                       return "LOCATION";
-                                       
-               if(word.getNe().equals("PERSON") ) 
-                       return "PERSON";
-               
-               if(word.getNe().equals("MONEY") ) 
-                       return "MONEY";
-               if(word.getNe().equals("DATE") ) 
-                       return "DATE";
-               if(word.getNe().equals("TIME") ) 
-                       return "TIME";
-
-               return null;
-
-       }
-}
-
-/*
- * Naïve  sentiment prediction systems work just by looking at words in 
isolation, giving positive points for positive words and negative points for 
negative words and then summing up these points. That way, the order of words 
is ignored and important information is lost. The deep learning model of 
(Socher et al 2013) builds a representation of whole sentences based on the 
sentence structure. It computes the sentiment based on how words compose the 
meaning of longer phrases. However, in most applications just taking individual 
sentences into account do not give accurate results and rhetoric information 
needs to be taken into account to determine the overall sentiment of a 
paragraph and then back to the individual sentence level.
- */
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
deleted file mode 100644
index 6de3180..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.lang.reflect.Array;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import au.com.bytecode.opencsv.CSVWriter;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-
-public class TwitterEngineRunner {
-       private List<File> queue;
-       private final static String twSource = 
"/Users/bgalitsky/Documents/workspace/TwitterMiner/data/TwitterArtistsDynamicsTot12_07.csv";
-       TwitterFilter neExtractor = new TwitterFilter();
-       private static int iWind = 80;
-
-       public void processTweetFile(int nRun){
-               List<String[]> report = new ArrayList<String[]>(), ful_less =  
new ArrayList<String[]>();
-               List<String> meaningLESS = new ArrayList<String>(), meaningFUL 
= new ArrayList<String>();
-               report.add(new String[] { "text", "phrases of potential 
interest list" , });
-
-               List<String[]> texts = 
ProfileReaderWriter.readProfiles(twSource);
-               int offset = iWind*nRun;
-               
-               //for(int i=offset; i< offset+iWind; i++){
-                       
-               //      String[] text = texts.get(i);
-               for(String[] text: texts){
-                       List<String> textDeduped = new ArrayList<String>(new 
HashSet<String>(Arrays.asList(text)));
-                       EntityExtractionResult result = null;
-                       if (text==null || text.length<4)
-                               continue;
-
-                       for(int nInLine=3; nInLine<textDeduped.size(); 
nInLine++){
-                               if (textDeduped.get(nInLine).length()>180)
-                                       continue;
-                               
-                               String cleanedTweet = 
textDeduped.get(nInLine).replace("/\\bs\\@+/ig","");
-                               try {
-                                       result = 
neExtractor.extractEntities(cleanedTweet);
-                               } catch (Exception e) {
-                                       e.printStackTrace();
-                                       continue;
-                               }
-                               report.add(new String[]{text[0],text[nInLine]});
-                               
report.add((String[])result.extractedNERWords.toArray(new String[0]));
-                               
//report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
-                               List<String> stringPhrases = new 
ArrayList<String>(),
-                                               nodePhrases = new 
ArrayList<String>();
-                               Boolean bMeaningf = false;
-
-                               //stringPhrases.add(""); nodePhrases.add(""); 
// to make report more readable
-                               for(List<ParseTreeNode> chList: 
result.extractedSentimentPhrases){
-                                       String buf = "", nodeBuf="";
-                                       for(ParseTreeNode ch: chList){
-                                               buf+=ch.getWord()+ " ";
-                                               nodeBuf+=ch.toString()+ " ";
-                                       }
-                                       stringPhrases.add(buf.trim());
-                                       nodePhrases.add(nodeBuf.trim());
-                               }
-                               // selecting MEANINGFULL
-                               if (nodePhrases.size()>1){
-                                       if 
((nodePhrases.get(0).indexOf(">VP'")>-1 || 
nodePhrases.get(0).indexOf(">NNP'")>-1) &&
-                                                       
(nodePhrases.get(1).indexOf(">VP'")>-1 || 
nodePhrases.get(1).indexOf(">NNP'")>-1)){
-                                               bMeaningf = true;
-
-                                       }
-                               }
-
-                               report.add((String[])stringPhrases.toArray(new 
String[0]));
-                               report.add((String[])nodePhrases.toArray(new 
String[0]));
-                               if (bMeaningf){
-                                       report.add(new String[]{"===", 
"MEANINGFUL tweet"});
-                                       if (!meaningFUL.contains(cleanedTweet))
-                                               meaningFUL.add(cleanedTweet);
-                               } else {
-                                       if (!meaningLESS.contains(cleanedTweet))
-                                               meaningLESS.add(cleanedTweet);
-                               }
-
-                               int count = 0;
-                               ful_less.clear();
-                               for(String less: meaningLESS ){
-                                       String fl = "";
-                                       if (count<meaningFUL.size())
-                                               fl = meaningFUL.get(count);
-                                       ful_less.add(new String[]{less, fl});
-                                       count++;
-                               }
-
-                               report.add(new 
String[]{"-----------------------------------------------------"});
-                                       ProfileReaderWriter.writeReport(report, 
"phrasesExtractedFromTweets3_"+nRun+".csv");
-                                       
ProfileReaderWriter.writeReport(ful_less, "ful_lessTweets3_"+nRun+".csv");
-                               
-                       }
-               }
-       }
-
-
-       public static void main(String[] args){
-               TwitterEngineRunner runner = new TwitterEngineRunner();
-               int nRun = Integer.parseInt(args[0]);
-               runner.processTweetFile(nRun);
-
-       }
-}
-
-/*
-       public void processDirectory(String path){
-               List<String[]> report = new ArrayList<String[]>();
-               report.add(new String[] { "filename", "named entity list", 
"phrases of potential interest list" });
-
-               List<String> allNamedEntities = new ArrayList<String>();
-
-               addFiles(new File(path));
-               for(File f: queue){
-                       List<String> entities = (List<String>) 
extractEntities(f.getAbsolutePath()).getFirst();
-                       List<String> opinions = (List<String>) 
extractEntities(f.getAbsolutePath()).getSecond();
-                       report.add(new String[]{ f.getName(), 
entities.toString(),  opinions.toString()});      
-                       ProfileReaderWriter.writeReport(report, 
"nameEntitiesExtracted.csv");
-
-                       allNamedEntities.addAll(entities);
-
-                       allNamedEntities = new ArrayList<String>(new 
HashSet<String> (allNamedEntities ));
-
-
-               }
-               ProfileReaderWriter.writeReport(report, 
"nameEntitiesTopicsOfInterestExtracted.csv");
-       } 
-} */
-
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
deleted file mode 100644
index 0e5053d..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
-import opennlp.tools.jsmlearning.ProfileReaderWriter;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.VerbNetProcessor;
-import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
-import opennlp.tools.parse_thicket.matching.Matcher;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.similarity.apps.utils.Pair;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.TextProcessor;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-public class TwitterFilter {
-       protected static Matcher matcher;
-       private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
-       protected ArrayList<File> queue = new ArrayList<File>();
-       protected static PT2ThicketPhraseBuilder phraseBuilder;
-       protected static SentimentVocab sVocab = SentimentVocab.getInstance();
-       String resourceDirSentimentList = null;
-       Set<String> sentimentVcb = new HashSet<String> ();
-
-       static {
-               synchronized (TwitterFilter.class) {
-                       matcher = new Matcher();
-                       phraseBuilder = new PT2ThicketPhraseBuilder();
-               }
-       }
-
-       public TwitterFilter(){
-               try {
-                       resourceDirSentimentList = new File( "." 
).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
-               } catch (IOException e) {
-                       e.printStackTrace();
-               }
-               List<String[]> sentimentList=null;
-               sentimentList = 
ProfileReaderWriter.readProfiles(resourceDirSentimentList);
-               for(String[] line: sentimentList){
-                       sentimentVcb.add(line[0]);
-               }
-       }
-
-       private boolean isSentimentWord(String word){
-               if (sentimentVcb.contains(word))
-                       return true;
-               else
-                       return false;           
-       }
-
-       public EntityExtractionResult extractEntities(String para){
-               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
-               List<String> extractedNERsWords = new ArrayList<String>();
-               List<List<ParseTreeNode>> extractedSentimentPhrases = 
-                               new ArrayList<List<ParseTreeNode>>();
-               EntityExtractionResult result = new EntityExtractionResult();
-
-               ParseThicket pt = null;
-
-               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
-               pt = matcher.buildParseThicketFromTextWithRST(para);
-               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
-
-
-               for(List<ParseTreeNode> sentence: nodeList){
-                       System.out.println("   Processing sentence: "+ 
sentence);
-                       boolean bInsideNER = false; 
-                       String currentPhrase = "";
-                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
-                       for(ParseTreeNode word: sentence){
-                               if (isNERforPhraseExtraction(word)){
-                                       System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
-                                       if (bInsideNER){
-                                               currentPhrase += " 
"+word.getWord();
-                                               currentPhraseNode.add(word);
-                                       } else {
-                                               bInsideNER=true;
-                                               currentPhrase = word.getWord();
-                                               currentPhraseNode.add(word);
-                                       }
-                               } else {
-                                       if (bInsideNER){
-                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
-                                                       
extractedNERsWords.add(currentPhrase);
-                                                       
extractedNERs.add(currentPhraseNode);
-                                               currentPhrase = "";
-                                               bInsideNER=false;
-                                       } else {
-                                               // do nothing, continue scan
-                                       }
-                               }
-                       }
-                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
-                               extractedNERs.add(currentPhraseNode);
-                               extractedNERsWords.add(currentPhrase);
-                       }
-
-                       Set<String> foundSentimentWords = new HashSet<String>();
-                       // now we extract phrases
-                       List<List<ParseTreeNode>> phrases = pt.getPhrases();
-                       for(List<ParseTreeNode> phrase: phrases){
-                               // find a noun phrase under sentiment
-                               try {
-                                       for(int i = phrase.size()-1; i>-1; i--){
-                                               ParseTreeNode word = 
phrase.get(i);
-                                               if 
((isSentimentWord(word.getWord()) ||
-                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
-                                                       
foundSentimentWords.add(word.getWord());
-                                                       
System.out.println("Found opinionated phrase "+phrase.toString());
-                                                       
extractedSentimentPhrases.add(phrase);                  
-                                                       break;
-                                               }
-                                       }
-                               } catch (Exception e) {
-                                       e.printStackTrace();
-                               }
-                       }
-
-               } 
-               result.setExtractedNER(extractedNERs);
-               result.setExtractedNERWords(extractedNERsWords);
-               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
-               return result;
-       }
-
-
-
-       private boolean isNERforPhraseExtraction(ParseTreeNode word){
-               if ((word.getNe().equals("ORGANIZATION") 
||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
-                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
-                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
-                       return true;
-
-               return false;
-
-       }
-
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
deleted file mode 100644
index a138de6..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.lang.StringUtils;
-
-import opennlp.tools.similarity.apps.utils.PageFetcher;
-
-public class YouTubeMiner {
-       private PageFetcher fetcher = new PageFetcher();
-       public YouTubeMinerResult getData(String url){
-               YouTubeMinerResult result = new YouTubeMinerResult();
-               String content = fetcher.fetchOrigHTML(url);
-               try {
-                       FileUtils.writeStringToFile(new File(url.replace(':', 
'_').replace('/', '_')), content);
-               } catch (IOException e1) {
-                       // TODO Auto-generated catch block
-                       e1.printStackTrace();
-               }
-               if (url.indexOf("channel")>-1){
-                       try { //subscriber-count" title="30" 
-                               String subscribersStr = 
StringUtils.substringBetween(content,"subscriber-count", "tabindex");
-                               String dirtyNumber = 
StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
-                               String cleanNumber = 
dirtyNumber.replaceAll("[^\\x00-\\x7F]", "");
-                               if (cleanNumber!=null){
-                                       int subscribers = 
Integer.parseInt(cleanNumber );
-                                       result.subscribers = subscribers;
-                               } else {
-                                       System.err.println("Not found data for 
'subscriber-count', 'tabindex'");
-                               }
-                       } catch (NumberFormatException e) {
-                               // TODO Auto-generated catch block
-                               e.printStackTrace();
-                       }
-               } else {
-                       try {
-
-                               String subscribersStr = 
StringUtils.substringBetween(content,"subscriber-count", "tabindex");
-                               String dirtyNumber = 
StringUtils.substringBetween(subscribersStr, "title=\"", "\"").replace(" ", "");
-                               if (dirtyNumber!=null){
-                                       int subscribers = 
Integer.parseInt(dirtyNumber );
-                                       result.subscribers = subscribers;
-                               } else {
-                                       System.err.println("Not found data for 
'subscriber-count', 'tabindex'");
-                               }
-
-                               String viewsStrDirty = 
StringUtils.substringBetween(content,
-                                               //"div 
class=\"watch-view-count\">"," views</div>");
-                                               //view-count">12 
Ð¿ÑÐ¾ÑÐ¼Ð¾ÑÑÐ¾Ð²</div>
-                                               "view-count","<div>");
-                               String viewsStr = 
StringUtils.substringBetween(viewsStrDirty,">", " ");
-                               if (viewsStr!=null){
-                                       int views = Integer.parseInt(viewsStr );
-                                       result.views = views;
-                               } else {
-                                       System.err.println("Not found data for 
'view-count','<div>'");
-                               }
-                       } catch (NumberFormatException e) {
-                               // TODO Auto-generated catch block
-                               e.printStackTrace();
-                       }
-               }
-
-               return result;
-       }
-
-
-
-
-       public static void main(String[] args){
-               YouTubeMiner  miner = new YouTubeMiner();
-               
System.out.println(miner.getData("https://www.youtube.com/channel/UC-maQbG5eUS5c1wmaTnLwTA"));
-               
System.out.println(miner.getData("https://www.youtube.com/watch?v=U6X4VT9dVr8"));
-               
System.out.println(miner.getData("https://www.youtube.com/watch?v=kH-AQnta714"));
-               
System.out.println(miner.getData("https://www.youtube.com/watch?v=pWb50Kn1ShQ"));
-       }
-}
-
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
deleted file mode 100644
index 86c8e9d..0000000
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.tools.parse_thicket.opinion_processor;
-
-public class YouTubeMinerResult {
-       public int likes;
-       public int subscribers;
-       public int views;
-       
-       boolean isPromisingYoungIndividual(){
-               if (subscribers>0)
-                       if (subscribers>10 && subscribers< 20000)
-                               return true;
-               if (views>0)
-                       if (views>10 && views< 20000)
-                               return true;
-               return false;
-
-       }
-       
-       public String toString(){
-               return "views :"+ views + "| subscribers = "+ subscribers;
-       }
-}

[07/11] opennlp-sandbox git commit: removed stanford nlp refs

Reply via email to