http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
new file mode 100644
index 0000000..b766c7c
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class NamedEntityExtractor {
+       protected static Matcher matcher;
+       private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
+       protected ArrayList<File> queue = new ArrayList<File>();
+       protected static PT2ThicketPhraseBuilder phraseBuilder;
+       protected static SentimentVocab sVocab = SentimentVocab.getInstance();
+       String resourceDirSentimentList = null;
+       Set<String> sentimentVcb = new HashSet<String> ();
+
+       static {
+               synchronized (NamedEntityExtractor.class) {
+                       matcher = new Matcher();
+                       phraseBuilder = new PT2ThicketPhraseBuilder();
+               }
+       }
+
+       public NamedEntityExtractor(){
+               try {
+                       resourceDirSentimentList = new File( "." 
).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
+               } catch (IOException e) {
+                       e.printStackTrace();
+               }
+               List<String[]> sentimentList=null;
+               sentimentList = 
ProfileReaderWriter.readProfiles(resourceDirSentimentList);
+               for(String[] line: sentimentList){
+                       sentimentVcb.add(line[0]);
+               }
+       }
+
+       protected boolean isSentimentWord(String word){
+               if (sentimentVcb.contains(word))
+                       return true;
+               else
+                       return false;           
+       }
+
+       public EntityExtractionResult extractEntities(String para){
+               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
+               List<String> extractedNERsWords = new ArrayList<String>();
+               List<List<ParseTreeNode>> extractedSentimentPhrases = 
+                               new ArrayList<List<ParseTreeNode>>();
+               EntityExtractionResult result = new EntityExtractionResult();
+
+               ParseThicket pt = null;
+
+               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
+               pt = matcher.buildParseThicketFromTextWithRST(para);
+               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+               for(List<ParseTreeNode> sentence: nodeList){
+                       //System.out.println("   Processing sentence: "+ 
sentence);
+                       boolean bInsideNER = false; 
+                       String currentPhrase = "";
+                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
+                       for(ParseTreeNode word: sentence){
+                               if (isNERforPhraseExtraction(word)){
+                                       //System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
+                                       if (bInsideNER){
+                                               currentPhrase += " 
"+word.getWord();
+                                               currentPhraseNode.add(word);
+                                       } else {
+                                               bInsideNER=true;
+                                               currentPhrase = word.getWord();
+                                               currentPhraseNode.add(word);
+                                       }
+                               } else {
+                                       if (bInsideNER){
+                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
+                                                       
extractedNERsWords.add(currentPhrase);
+                                                       
extractedNERs.add(currentPhraseNode);
+                                               currentPhrase = "";
+                                               bInsideNER=false;
+                                       } else {
+                                               // do nothing, continue scan
+                                       }
+                               }
+                       }
+                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
+                               extractedNERs.add(currentPhraseNode);
+                               extractedNERsWords.add(currentPhrase);
+                       }
+
+                       Set<String> foundSentimentWords = new HashSet<String>();
+                       // now we extract phrases
+                       List<List<ParseTreeNode>> phrases = pt.getPhrases();
+                       for(List<ParseTreeNode> phrase: phrases){
+                               // find a noun phrase under sentiment
+                               try {
+                                       for(int i = phrase.size()-1; i>-1; i--){
+                                               ParseTreeNode word = 
phrase.get(i);
+                                               if 
((isSentimentWord(word.getWord()) ||
+                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
+                                                       
foundSentimentWords.add(word.getWord());
+                                                       
System.out.println("Sentim = " + word.getWord() + " | Found opinionated phrase 
"+phrase.toString());
+                                                       if (phrase.size()>1 && 
phrase.size()<7)
+                                                               
extractedSentimentPhrases.add(phrase);                  
+                                                       break;
+                                               }
+                                       }
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                               }
+                       }
+
+               } 
+               
+               extractedSentimentPhrases = 
reduceExtractedPhrases(extractedSentimentPhrases);
+               
+               result.setExtractedNER(extractedNERs);
+               result.setExtractedNERWords(extractedNERsWords);
+               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+               return result;
+       }
+
+       private List<List<ParseTreeNode>> 
reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
+           List<Integer> idsToDelete = new ArrayList<Integer>();
+               for(int i = 0; i<extractedSentimentPhrases.size(); i++){
+                       for(int j = i+1; j<extractedSentimentPhrases.size(); 
j++){
+                               String phrStr1 = 
ParseTreeNode.toWordString(extractedSentimentPhrases.get(i));
+                               String phrStr2 = 
ParseTreeNode.toWordString(extractedSentimentPhrases.get(j));
+                               if (phrStr1 .indexOf(phrStr2 )>-1)
+                                       idsToDelete.add(j);
+                       }
+               }
+               List<List<ParseTreeNode>> resultPhrases = new 
ArrayList<List<ParseTreeNode>>();
+               for(int i = 0; i<extractedSentimentPhrases.size(); i++){
+                       if (!idsToDelete.contains(i))
+                               resultPhrases 
.add(extractedSentimentPhrases.get(i));
+               }
+           return resultPhrases ;
+    }
+
+       private boolean isNERforPhraseExtraction(ParseTreeNode word){
+               if ((word.getNe().equals("ORGANIZATION") 
||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
+                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
+                       return true;
+
+               return false;
+
+       }
+
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
new file mode 100644
index 0000000..cb04154
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/PersonExtractor.java
@@ -0,0 +1,96 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+
+public class PersonExtractor extends NamedEntityExtractor {
+       private boolean isNERforPhraseExtraction(ParseTreeNode word){
+               if ((word.getNe().equals("PERSON") ) &&
+                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
+                       return true;
+
+               return false;
+
+       }
+       
+       public EntityExtractionResult extractEntities(String para){
+               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
+               List<String> extractedNERsWords = new ArrayList<String>();
+               List<List<ParseTreeNode>> extractedSentimentPhrases = 
+                               new ArrayList<List<ParseTreeNode>>();
+               EntityExtractionResult result = new EntityExtractionResult();
+
+               ParseThicket pt = null;
+
+               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
+               pt = matcher.buildParseThicketFromTextWithRST(para);
+               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+               for(List<ParseTreeNode> sentence: nodeList){
+                       System.out.println("   Processing sentence: "+ 
sentence);
+                       boolean bInsideNER = false; 
+                       String currentPhrase = "";
+                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
+                       for(ParseTreeNode word: sentence){
+                               if (isNERforPhraseExtraction(word)){
+                                       System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
+                                       if (bInsideNER){
+                                               currentPhrase += " 
"+word.getWord();
+                                               currentPhraseNode.add(word);
+                                       } else {
+                                               bInsideNER=true;
+                                               currentPhrase = word.getWord();
+                                               currentPhraseNode.add(word);
+                                       }
+                               } else {
+                                       if (bInsideNER){
+                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
+                                                       
extractedNERsWords.add(currentPhrase);
+                                                       
extractedNERs.add(currentPhraseNode);
+                                               currentPhrase = "";
+                                               bInsideNER=false;
+                                       } else {
+                                               // do nothing, continue scan
+                                       }
+                               }
+                       }
+                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
+                               extractedNERs.add(currentPhraseNode);
+                               extractedNERsWords.add(currentPhrase);
+                       }
+
+                       Set<String> foundSentimentWords = new HashSet<String>();
+                       // now we extract phrases
+                       List<List<ParseTreeNode>> phrases = 
phraseBuilder.buildPT2ptPhrases(pt);
+                       for(List<ParseTreeNode> phrase: phrases){
+                               // find a noun phrase under sentiment
+                               try {
+                                       for(int i = phrase.size()-1; i>-1; i--){
+                                               ParseTreeNode word = 
phrase.get(i);
+                                               if 
((isSentimentWord(word.getWord()) ||
+                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
+                                                       
foundSentimentWords.add(word.getWord());
+                                                       
System.out.println("Found opinionated phrase "+phrase.toString());
+                                                       
extractedSentimentPhrases.add(phrase);                  
+                                                       break;
+                                               }
+                                       }
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                               }
+                       }
+
+               } 
+               result.setExtractedNER(extractedNERs);
+               result.setExtractedNERWords(extractedNERsWords);
+               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+               return result;
+       }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
new file mode 100644
index 0000000..86cd2dc
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentencePhraseGivenAWordGetter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class SentencePhraseGivenAWordGetter {
+       protected static Matcher matcher;
+       protected ArrayList<File> queue = new ArrayList<File>();
+       protected static PT2ThicketPhraseBuilder phraseBuilder;
+
+
+       static {
+               synchronized (SentencePhraseGivenAWordGetter.class) {
+                       matcher = new Matcher();
+                       phraseBuilder = new PT2ThicketPhraseBuilder();
+               }
+       }
+
+       public SentencePhraseGivenAWordGetter(){
+       }
+
+       public EntityExtractionResult extractEntities(String para, String 
keyword){
+               List<List<ParseTreeNode>> extractedPhrases = new 
ArrayList<List<ParseTreeNode>>();
+
+               EntityExtractionResult result = new EntityExtractionResult();
+
+               ParseThicket pt =  
matcher.buildParseThicketFromTextWithRST(para);
+
+               List<List<ParseTreeNode>> phrases = pt.getPhrases();
+               for(List<ParseTreeNode> phrase: phrases){
+                       // find a noun phrase under sentiment
+                       try {
+                               for(int i = 0; i<phrase.size(); i++){
+                                       ParseTreeNode word = phrase.get(i);
+                                       if 
(word.getWord().toLowerCase().equals(keyword.toLowerCase())){
+                                               extractedPhrases.add(phrase);   
        
+                                               break;
+                                       }
+                               }
+                       } catch (Exception e) {
+                               e.printStackTrace();
+                       }
+               }
+
+               result.setExtractedSentimentPhrases(extractedPhrases);
+               return result;
+       }
+
+
+       public static void main(String[] args){
+               SentencePhraseGivenAWordGetter self = new 
SentencePhraseGivenAWordGetter();
+               EntityExtractionResult result = self.extractEntities("However i 
put a foam panel inside the main case if i do not have my headphones or an iPad 
to brace the mac book", 
+                               "panel");
+               System.out.println(result.getExtractedSentimentPhrases());
+       }
+}
+
+
+/*
+ 3 phrases are given as a result
+ * 
+[[<2>SBAR'i':FW, <3>SBAR'put':VBD, <4>SBAR'a':DT, <5>SBAR'foam':NN, 
<6>SBAR'panel':NN, <7>SBAR'inside':IN, <8>SBAR'the':DT, <9>SBAR'main':JJ, 
<10>SBAR'case':NN, <11>SBAR'if':IN, <12>SBAR'i':FW, 
+<13>SBAR'do':VBP, <14>SBAR'not':RB, <15>SBAR'have':VB, <16>SBAR'my':PRP$, 
<17>SBAR'headphones':NNS, <18>SBAR'or':CC, <19>SBAR'an':DT, <20>SBAR'iPad':NN, 
<21>SBAR'to':TO, 
+<22>SBAR'brace':VB, <23>SBAR'the':DT, <24>SBAR'mac':NN, <25>SBAR'book':NN], 
+
+[<3>VP'put':VBD, <4>VP'a':DT, <5>VP'foam':NN, <6>VP'panel':NN, 
<7>VP'inside':IN, <8>VP'the':DT, <9>VP'main':JJ, <10>VP'case':NN, 
<11>VP'if':IN, <12>VP'i':FW, <13>VP'do':VBP, 
+<14>VP'not':RB, <15>VP'have':VB, <16>VP'my':PRP$, <17>VP'headphones':NNS, 
<18>VP'or':CC, <19>VP'an':DT, <20>VP'iPad':NN, <21>VP'to':TO, <22>VP'brace':VB, 
<23>VP'the':DT, 
+<24>VP'mac':NN, <25>VP'book':NN], 
+
+[<4>NP'a':DT, <5>NP'foam':NN, <6>NP'panel':NN]]
+
+*/

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
new file mode 100644
index 0000000..1efe428
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/SentimentCoreAnnotations.java
@@ -0,0 +1,41 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import edu.stanford.nlp.ling.CoreAnnotation;
+
+import edu.stanford.nlp.trees.Tree;
+
/**
 * Annotations specific to the Sentiment project.  In case there are
 * other projects that use the same RNN machinery, including the RNN
 * core annotations, this lets a sentence have a tree attached where
 * that tree specifically has the sentiment annotations.
 *
 * <p>Both nested types are stateless key classes: CoreNLP annotation keys
 * carry no data themselves, they only type the values stored in a CoreMap.
 *
 * @author John Bauer
 */
public class SentimentCoreAnnotations {

  /**
   * A tree which contains the annotations used for the Sentiment
   * task.  After forwardPropagate has been called, the Tree will have
   * prediction, etc. attached to it.
   */
  public static class SentimentAnnotatedTree implements CoreAnnotation<Tree> {
    @Override
    public Class<Tree> getType() {
      // the annotation value stored under this key is a parse Tree
      return Tree.class;
    }
  }


  /**
   * The final label given for a sentence.  Set by the
   * SentimentAnnotator and used by various forms of text output.
   */
  public static class SentimentClass implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      // the annotation value stored under this key is a plain String label
      return String.class;
    }
  }
}
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
new file mode 100755
index 0000000..ad0f791
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/StopList.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.stemmer.PStemmer;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+
+public class StopList {
    // Singleton instance, lazily created in getInstance().
    private static StopList m_StopList = null;
    // Map from list name (the first line of each .vcb file) to its word set.
    private static Hashtable<String, HashSet<String>> m_stopHash = new Hashtable<String, HashSet<String>>();
    public static final Log logger = LogFactory.getLog(StopList.class);
    // Name of the list consulted by the single-argument isStopWord().
    private static final String DEFAULT_STOPLIST = "STANDARD";
    // Root directory containing the "maps" folder with the *.vcb stop lists.
    public static String resourceDir = null;
    private static PStemmer stemmer = new PStemmer();

    static {
        synchronized (StopList.class) {
            try {
                LoadStopList();
            } catch (IOException e) {
                // NOTE(review): at class-load time resourceDir is still null unless
                // assigned externally first, so this initial load looks under
                // "null/maps" and typically finds nothing — verify intended.
                e.printStackTrace();
            }
        }
    }
+
+    /**
+     * Get the StopList singleton instance.
+     * 
+     * @return The StopList
+     */
+    static public synchronized StopList getInstance() {
+
+        if (m_StopList == null) {
+            m_StopList = new StopList();
+
+            try {
+                m_StopList.LoadStopList();
+            } catch (Exception e) {
+
+            }
+        }
+        return m_StopList;
+    }
+
+    static public synchronized StopList getInstance(String dir) {
+        resourceDir = dir;
+        if (m_StopList == null) {
+            m_StopList = new StopList();
+
+            try {
+                m_StopList.LoadStopList();
+            } catch (Exception e) {
+
+            }
+        }
+        return m_StopList;
+    }
+
+    private static void LoadStopList() throws IOException {
+
+        File dir = new File(resourceDir + "/maps");
+        String[] children = dir.list();
+        if (children == null) {
+            System.err.println("Problem reading Stop Lists!");
+        } else {
+            for (int i = 0; i < children.length; i++) {
+                String fn = children[i];
+                if (fn.endsWith(".vcb")) {
+                    String fileName = resourceDir + "/maps/" + fn;
+                    File f = new File(fileName);
+                    loadStopListFile(f);
+                }
+            }
+        }
+    }
+
+    private static void loadStopListFile(File f) throws FileNotFoundException {
+
+        FileReader fileReader = new FileReader(f);
+        BufferedReader in = new BufferedReader(fileReader);
+
+        String str = new String();
+        boolean fLine = true;
+        HashSet<String> t = new HashSet<String>();
+        String listName = "";
+
+        try {
+            while ((str = in.readLine()) != null) {
+                if (fLine && str.length() > 0) {
+                    fLine = false;
+                    listName = str;
+                } else {
+                    t.add(str);
+                }
+            }
+        } catch (IOException ioe) {
+
+        } finally {
+            try {
+                if (in != null) {
+                    in.close();
+                }
+                if (fileReader != null) {
+                    fileReader.close();
+                }
+            } catch (IOException ioe) {
+                ioe.printStackTrace();
+            }
+        }
+
+        if (listName.length() > 0) {
+            HashSet<String> l = m_stopHash.get(listName);
+            if (l != null) {
+                synchronized (l) {
+                    m_stopHash.put(listName, t);
+                }
+            } else {
+                m_stopHash.put(listName, t);
+            }
+        }
+    }
+
+    /**
+     * Is the given word in the stop words list? Uses the defaut "STANDARD"
+     * stoplist
+     * 
+     * @param str
+     *            The word to check
+     * @return is a stop word
+     */
+    public static boolean isStopWord(String str) {
+        boolean retVal = false;
+        if (m_stopHash.containsKey(DEFAULT_STOPLIST))
+            retVal = m_stopHash.get(DEFAULT_STOPLIST).contains(str);
+        return retVal;
+    }
+
+    public static boolean isFirstName(String str) {
+        boolean retVal = false;
+        if (m_stopHash.containsKey("FIRST_NAMES"))
+            retVal = m_stopHash.get("FIRST_NAMES").contains(str.toUpperCase());
+        return retVal;
+    }
+
+    public String getRandomFirstName() {
+        HashSet<String> firstNames = m_stopHash.get("FIRST_NAMES");
+        int indexRand = (int) (Math.random() * new Float(firstNames.size()));
+        Iterator iter = firstNames.iterator();
+        for (int i = 0; i < indexRand; i++) {
+            iter.next();
+        }
+        return ((String) iter.next()).toLowerCase();
+    }
+
+    public static boolean isCommonWord(String str) {
+        if (str == null)
+            return true;
+        String stemmed="";
+               try {
+                       stemmed = stemmer.stem(str).toLowerCase();
+               } catch (Exception e) {
+                       //stemming exceptions are not informative, jiust ignore 
wthis word
+                       //e.printStackTrace();
+               }
+
+        boolean retVal = false;
+        if (m_stopHash.containsKey("ENG_DICT"))
+            retVal = m_stopHash.get("ENG_DICT").contains(stemmed);
+        return retVal;
+    }
+
+    public boolean isCommonEventWord(String str) {
+        if (str == null)
+            return true;
+        boolean retVal = false;
+
+        try {
+            String stemmed = str.toLowerCase();
+
+            if (m_stopHash.containsKey("fREQUENTEVENTNAMEWORDS"))
+                retVal = m_stopHash.get("fREQUENTEVENTNAMEWORDS").contains(
+                        stemmed);
+        } catch (Exception e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
+        return retVal;
+    }
+
+    /**
+     * Is the given word in the stop words list provided?
+     * 
+     * @param str
+     *            The word to check
+     * @param stop_list
+     *            the name of the stoplist to check against
+     * @return is a stop word
+     */
+    public static boolean isStopWord(String str, String stop_list) {
+        boolean retVal = false;
+        if (m_stopHash.containsKey(stop_list))
+            retVal = m_stopHash.get(stop_list).contains(str);
+        return retVal;
+    }
+
+    public boolean isStopWordAll(String str) {
+        return isStopWord(str);
+    }
+
+    public HashSet<String> getStopListMap(String name) {
+        return m_stopHash.get(name);
+    }
+
+    public static List<List<String>> preFilterCommonEnglishExpressions(
+            List<String> userLikes) {
+        List<List<String>> results = new ArrayList<List<String>>();
+
+        List<String> resultUserLikes = new ArrayList<String>(), 
potentialCategs = new ArrayList<String>();
+        if (userLikes.size() < 6) {// too short, do not filter
+            results.add(userLikes);
+            results.add(potentialCategs);
+            return results;
+
+        }
+
+        for (String like : userLikes) {
+            like = like.toLowerCase();
+            if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
+                logger.info("removed isAlphanumeric " + like);
+                continue;
+            }
+
+            if (StringUtils.isNumeric(like)) {
+                logger.info("removed isNumericSpace " + like);
+                continue;
+            }
+
+            if (like.length() < 4) {
+                logger.info("removed too short likes " + like);
+                continue;
+            }
+            boolean existFirstName = false, allWordsCommonEnglish = true, 
bStop = false;
+            String[] comps = like.split(" ");
+            StringBuffer buf = new StringBuffer();
+            for (String word : comps) {
+                boolean isCommon = isCommonWord(word);
+                boolean isName = isFirstName(word);
+                if (!isCommon)
+                    allWordsCommonEnglish = false;
+                if (isName)
+                    existFirstName = true;
+                if (isStopWord(word) || word.length() < 3)
+                    bStop = true;
+                else
+                    buf.append(word + " ");
+            } // / does not have to include stop word
+            if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
+                logger.info("moved to category:  
NoFirstName+AllCommonEng+ShorterThan3 "
+                        + like);
+
+                continue;
+            }
+            if (!existFirstName && allWordsCommonEnglish && comps.length == 1) 
{
+                logger.info("moved to category: 
NoFirstName+AllCommonEng+Short1word "
+                        + like);
+                potentialCategs.add(like);
+                continue;
+            }
+
+            if (existFirstName && comps.length == 1) {
+                logger.info("removed : only first name, no last name " + like);
+
+                continue;
+            }
+
+            resultUserLikes.add(buf.toString().trim());
+
+        }
+
+        resultUserLikes = new ArrayList<String>(new HashSet<String>(
+                resultUserLikes));
+        if (resultUserLikes.size() > 1) {
+            results.add(resultUserLikes);
+            results.add(potentialCategs);
+            return results;
+        }
+
+        else {// do not do reduction
+            results.add(userLikes);
+            results.add(potentialCategs);
+            return results;
+        }
+    }
+
+    public static boolean isAcceptableIndividualLikes(String like) {
+        StopList finder = StopList.getInstance();
+        like = like.toLowerCase();
+        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
+            logger.info("removed isAlphanumeric " + like);
+            return false;
+        }
+
+        if (StringUtils.isNumeric(like)) {
+            logger.info("removed isNumericSpace " + like);
+            return false;
+        }
+
+        if (like.length() < 4) {
+            logger.info("removed too short likes " + like);
+            return false;
+        }
+        boolean existFirstName = false, allWordsCommonEnglish = true, bStop = 
false;
+        String[] comps = like.split(" ");
+        StringBuffer buf = new StringBuffer();
+        for (String word : comps) {
+            boolean isCommon = finder.isCommonWord(word);
+            boolean isName = finder.isFirstName(word);
+            if (!isCommon)
+                allWordsCommonEnglish = false;
+            if (isName)
+                existFirstName = true;
+            if (finder.isStopWord(word) || word.length() < 3)
+                bStop = true;
+            else
+                buf.append(word + " ");
+        } // / does not have to include stop word
+        if (!existFirstName && allWordsCommonEnglish && comps.length < 3) {
+            logger.info("  NoFirstName+AllCommonEng+ShorterThan3 " + like);
+
+            return false;
+        }
+        if (!existFirstName && allWordsCommonEnglish && comps.length == 1) {
+            logger.info(" NoFirstName+AllCommonEng+Short1word " + like);
+
+            return false;
+        }
+
+        if (existFirstName && comps.length == 1) {
+            logger.info("removed : only first name, no last name " + like);
+
+            return false;
+        }
+
+        return true;
+    }
+
+    @SuppressWarnings("all")
+    public static void main(String[] args) {
+
+        StopList list = StopList
+                
.getInstance("/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/");
+        Boolean b = list.isCommonWord("demonstration");
+
+        String fname = list.getRandomFirstName();
+
+        b = list.isCommonEventWord("tour");
+        b = list.isCommonEventWord("dance");
+        b = list.isCommonEventWord("salsa");
+        b = list.isCommonEventWord("center");
+        b = list.isCommonEventWord("family");
+
+      
+
+        b = isAcceptableIndividualLikes("forest glen");
+        b = isAcceptableIndividualLikes("drive");
+        b = isAcceptableIndividualLikes("house");
+        b = isAcceptableIndividualLikes("Timothy Kloug");
+        b = isAcceptableIndividualLikes("Mamma Mia");
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
new file mode 100644
index 0000000..f4d56aa
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicAsOpinionMinerRunner.java
@@ -0,0 +1,117 @@
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class TopicAsOpinionMinerRunner {
+       private List<File> queue;
+       private final static String reviewSource = 
"/Users/bgalitsky/Documents/solr/example/exampledocs/publication_page0.json";
+       NamedEntityExtractor neExtractor = new NamedEntityExtractor();
+       Set<String> allPhrases = new HashSet<String>();
+       
+       public void processJSONfileWithReviews(){
+               List<String[]> report = new ArrayList<String[]>();
+               report.add(new String[] { "text", "phrases of potential 
interest list" , });
+
+               
+               String content=null;
+               try {
+                       content = FileUtils.readFileToString(new 
File(reviewSource));
+               } catch (IOException e) {
+                       e.printStackTrace();
+               }
+               String[] texts = StringUtils.substringsBetween(content, 
"summary\":\"", "\"");
+               for(String text: texts){
+                       report.clear();
+                       EntityExtractionResult result = 
neExtractor.extractEntities(text);
+                       //report.add(new String[]{text});
+                       allPhrases.addAll(result.extractedNERWords);
+                       allPhrases = new HashSet<String>(allPhrases);
+                       for(String p: allPhrases){
+                               report.add(new String[]{p});
+                       }
+                       /*
+                       String[] phrases = 
(String[])result.extractedNERWords.toArray(new String[0]);
+                       if (phrases!=null && phrases.length>0)
+                               report.add(phrases);
+                       */
+                       
/*report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
+                       List<String> stringPhrases = new ArrayList<String>(),
+                                       nodePhrases = new ArrayList<String>();
+                       for(List<ParseTreeNode> chList: 
result.extractedSentimentPhrases){
+                               String buf = "", nodeBuf="";
+                               for(ParseTreeNode ch: chList){
+                                       buf+=ch.getWord()+ " ";
+                                       nodeBuf+=ch.toString()+ " ";
+                               }
+                               stringPhrases.add(buf.trim());
+                               nodePhrases.add(nodeBuf.trim());
+                       }
+                       report.add((String[])stringPhrases.toArray(new 
String[0]));
+                       report.add((String[])nodePhrases.toArray(new 
String[0]));
+                       */
+                       
+                       ProfileReaderWriter.writeReport(report, 
"phrasesExtracted3.csv");
+               }
+       }
+
+       private void addFiles(File file) {
+
+               if (!file.exists()) {
+                       System.out.println(file + " does not exist.");
+
+                       if (file.isDirectory()) {
+                               for (File f : file.listFiles()) {
+                                       if (f.getName().startsWith("."))
+                                               continue;
+                                       addFiles(f);
+                                       System.out.println(f.getName());
+                               }
+                       } else {
+                               queue.add(file);
+
+                       }
+               }
+       }
+       
+       public static void main(String[] args){
+               TopicAsOpinionMinerRunner runner = new 
TopicAsOpinionMinerRunner();
+               runner.processJSONfileWithReviews();
+
+       }
+}
+
+/*
+       public void processDirectory(String path){
+               List<String[]> report = new ArrayList<String[]>();
+               report.add(new String[] { "filename", "named entity list", 
"phrases of potential interest list" });
+
+               List<String> allNamedEntities = new ArrayList<String>();
+
+               addFiles(new File(path));
+               for(File f: queue){
+                       List<String> entities = (List<String>) 
extractEntities(f.getAbsolutePath()).getFirst();
+                       List<String> opinions = (List<String>) 
extractEntities(f.getAbsolutePath()).getSecond();
+                       report.add(new String[]{ f.getName(), 
entities.toString(),  opinions.toString()});      
+                       ProfileReaderWriter.writeReport(report, 
"nameEntitiesExtracted.csv");
+
+                       allNamedEntities.addAll(entities);
+
+                       allNamedEntities = new ArrayList<String>(new 
HashSet<String> (allNamedEntities ));
+
+
+               }
+               ProfileReaderWriter.writeReport(report, 
"nameEntitiesTopicsOfInterestExtracted.csv");
+       } 
+} */

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
new file mode 100644
index 0000000..a704f22
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TopicPhraseExtractor.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.Matcher;
+
+public class TopicPhraseExtractor {
+       Matcher matcher = new Matcher();
+
+       // sentiment vocabulary for phrase under the focus of sentiment
+       SentimentVocab sVocab = SentimentVocab.getInstance();
+       //This is used to create an XML with phrases. The same class for acro  
& phrases
+
+       public EntityExtractionResult extractEntities(String para){
+               EntityExtractionResult result = new EntityExtractionResult();
+               List<String> extractedNerPhrasesStr = new ArrayList<String>(), 
+                               extractedNerExactStr = new ArrayList<String>(),
+                               extractedSentimentPhrasesStr = 
+                               new ArrayList<String>(), 
extractedNONSentimentPhrasesStr = 
+                               new ArrayList<String>(), extractedNerPhraseTags 
= new ArrayList<String>();
+               // no need to change to extract more/less phrases
+               ParseThicket pt = 
matcher.buildParseThicketFromTextWithRST(para);
+
+               List<List<ParseTreeNode>> extractedSentimentPhrases = new 
ArrayList<List<ParseTreeNode>>(), 
+                               extractedNONSentimentPhrases = new 
ArrayList<List<ParseTreeNode>>(),
+                               extractedNerPhrases = new 
ArrayList<List<ParseTreeNode>>(),
+                                               extractedNerExactPhrases= new 
ArrayList<List<ParseTreeNode>>();
+               //TODO document examples / cases for each rule
+               // now we extract phrases
+               List<List<ParseTreeNode>> phrases = pt.getPhrases();
+               List<Float> sentimentProfile = pt.getSentimentProfile();
+               for(List<ParseTreeNode> phrase: phrases){
+
+                       // find a noun phrase under sentiment
+                       boolean bAccept = true, bNER = false;
+
+                       String phraseStr = asString(phrase);
+
+
+                       if (!phrase.get(0).getPhraseType().equals("NP") && 
!phrase.get(0).getPhraseType().equals("VP") )        
+                               bAccept = false;
+
+                       boolean bSentiment = false;
+                       for(ParseTreeNode word: phrase){
+                               if (sVocab.isSentimentWord(word.getWord())){
+                                       bSentiment=true;
+                                       break;
+                               }
+                       }
+
+                       String nerTagConfirmed = null;
+                       for(ParseTreeNode word: phrase){
+                               // no Named Entity
+                               String nerTag = isNERforPhraseExtraction(word);
+                               if (nerTag!=null){
+                                       bNER = true;
+                                       nerTagConfirmed = nerTag;
+                               }
+
+                               // no numbers nor prepositions
+                               if (word.getPos().startsWith("CD") || 
word.getPos().indexOf("PRP")>-1 )
+                                       bAccept = false;
+                       }
+                       if (!bAccept)
+                               continue;
+                       // was 7 -> 2
+                       if (phrase.size()>7 || phrase.size()<2)
+                               bAccept = false;
+
+                       if (phrase.get(0).getPos().equals("DT") && 
phrase.size()<3)
+                               bAccept = false;
+                       if (!bAccept)
+                               continue;
+
+                       String cleanedPhraseStr = cleanPhraseString(phraseStr);
+                       if (cleanedPhraseStr==null)
+                               bAccept = false;
+
+                       if (bAccept){
+                               if (bNER){
+                                       extractedNerPhrases.add(phrase);
+                                       extractedNerPhrasesStr.add(phraseStr);
+                                       
extractedNerPhraseTags.add(nerTagConfirmed );
+                                       // forming exact NER
+                                       List<ParseTreeNode> phraseNER_exact = 
new ArrayList<ParseTreeNode>();
+                                       String nerExactStr = "";
+                                       for(ParseTreeNode word: phrase){
+                                               String ner = 
isNERforPhraseExtraction(word);
+                                               if (ner!=null && 
ner.equals(nerTagConfirmed)){
+                                                       
phraseNER_exact.add(word);
+                                                       nerExactStr+=" 
"+word.getWord();
+                                               }
+                                       }
+                                       nerExactStr.trim();
+                                       
extractedNerExactPhrases.add(phraseNER_exact);
+                                       extractedNerExactStr.add(nerExactStr);
+                               }
+                               else if (bSentiment) {
+                                       
extractedSentimentPhrasesStr.add(cleanedPhraseStr);                             
        
+                                       extractedSentimentPhrases.add(phrase);
+                               } else {
+                                       
extractedNONSentimentPhrasesStr.add(cleanedPhraseStr);                          
        
+                                       
extractedNONSentimentPhrases.add(phrase);
+                               }
+                       }
+               } 
+
+               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+               
result.setExtractedSentimentPhrasesStr(extractedSentimentPhrasesStr);
+
+               
result.setExtractedNONSentimentPhrases(extractedNONSentimentPhrases);
+               
result.setExtractedNONSentimentPhrasesStr(extractedNONSentimentPhrasesStr);
+               
+               result.setExtractedNerPhrases(extractedNerPhrases);
+               result.setExtractedNerPhrasesStr(extractedNerPhrasesStr);
+               result.setExtractedNerPhraseTags(extractedNerPhraseTags);
+               
+               result.setExtractedNerExactPhrases(extractedNerExactPhrases);
+               result.setExtractedNerExactStr(extractedNerExactStr);
+
+               result.setSentimentProfile(sentimentProfile );
+
+               return result;
+       }
+
+
+
+
+
+
+       private String cleanPhraseString(String phraseStr) {
+               String p = phraseStr.toLowerCase();
+
+               if (p.startsWith("*") || p.startsWith("&") || p.startsWith("$"))
+                       return null;
+
+               if (p.startsWith("this ") || p.startsWith("other "))
+                       return null;
+
+               if (p.startsWith("a "))
+                       p = p.substring(2, p.length());
+               if (p.startsWith("the "))
+                       p = p.substring(4, p.length());
+               if (p.startsWith(", "))
+                       p = p.substring(2, p.length());
+
+               return p;
+       }
+
+       private String asString(List<ParseTreeNode> phrase) {
+               String buf = "";
+               for(ParseTreeNode p: phrase)
+                       buf+=p.getWord()+" ";
+               return buf.trim();
+       }
+
+       private String isNERforPhraseExtraction(ParseTreeNode word){
+               if (word.getNe() == null)
+                       return null;
+               
+
+               if (!(word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+                               word.getPos().startsWith("JJ") || 
word.getPos().startsWith("DT")))
+                       return null;
+                               
+
+               if (word.getNe().equals("ORGANIZATION"))
+                               return "ORGANIZATION";
+               if(word.getNe().equals("LOCATION"))
+                       return "LOCATION";
+                                       
+               if(word.getNe().equals("PERSON") ) 
+                       return "PERSON";
+               
+               if(word.getNe().equals("MONEY") ) 
+                       return "MONEY";
+               if(word.getNe().equals("DATE") ) 
+                       return "DATE";
+               if(word.getNe().equals("TIME") ) 
+                       return "TIME";
+
+               return null;
+
+       }
+}
+
+/*
 * Naïve sentiment prediction systems work just by looking at words in
 * isolation, giving positive points for positive words and negative points
 * for negative words and then summing up these points. That way, the order of
 * words is ignored and important information is lost. The deep learning model
 * of (Socher et al 2013) builds a representation of whole sentences based on
 * the sentence structure. It computes the sentiment based on how words compose
 * the meaning of longer phrases. However, in most applications just taking
 * individual sentences into account does not give accurate results, and
 * rhetorical information needs to be taken into account to determine the
 * overall sentiment of a paragraph and then propagate it back to the
 * individual sentence level.
+ */
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
new file mode 100644
index 0000000..6de3180
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterEngineRunner.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import au.com.bytecode.opencsv.CSVWriter;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class TwitterEngineRunner {
+       private List<File> queue;
+       private final static String twSource = 
"/Users/bgalitsky/Documents/workspace/TwitterMiner/data/TwitterArtistsDynamicsTot12_07.csv";
+       TwitterFilter neExtractor = new TwitterFilter();
+       private static int iWind = 80;
+
+       public void processTweetFile(int nRun){
+               List<String[]> report = new ArrayList<String[]>(), ful_less =  
new ArrayList<String[]>();
+               List<String> meaningLESS = new ArrayList<String>(), meaningFUL 
= new ArrayList<String>();
+               report.add(new String[] { "text", "phrases of potential 
interest list" , });
+
+               List<String[]> texts = 
ProfileReaderWriter.readProfiles(twSource);
+               int offset = iWind*nRun;
+               
+               //for(int i=offset; i< offset+iWind; i++){
+                       
+               //      String[] text = texts.get(i);
+               for(String[] text: texts){
+                       List<String> textDeduped = new ArrayList<String>(new 
HashSet<String>(Arrays.asList(text)));
+                       EntityExtractionResult result = null;
+                       if (text==null || text.length<4)
+                               continue;
+
+                       for(int nInLine=3; nInLine<textDeduped.size(); 
nInLine++){
+                               if (textDeduped.get(nInLine).length()>180)
+                                       continue;
+                               
+                               String cleanedTweet = 
textDeduped.get(nInLine).replace("/\\bs\\@+/ig","");
+                               try {
+                                       result = 
neExtractor.extractEntities(cleanedTweet);
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                                       continue;
+                               }
+                               report.add(new String[]{text[0],text[nInLine]});
+                               
report.add((String[])result.extractedNERWords.toArray(new String[0]));
+                               
//report.add((String[])result.extractedSentimentPhrases.toArray(new String[0]));
+                               List<String> stringPhrases = new 
ArrayList<String>(),
+                                               nodePhrases = new 
ArrayList<String>();
+                               Boolean bMeaningf = false;
+
+                               //stringPhrases.add(""); nodePhrases.add(""); 
// to make report more readable
+                               for(List<ParseTreeNode> chList: 
result.extractedSentimentPhrases){
+                                       String buf = "", nodeBuf="";
+                                       for(ParseTreeNode ch: chList){
+                                               buf+=ch.getWord()+ " ";
+                                               nodeBuf+=ch.toString()+ " ";
+                                       }
+                                       stringPhrases.add(buf.trim());
+                                       nodePhrases.add(nodeBuf.trim());
+                               }
+                               // selecting MEANINGFULL
+                               if (nodePhrases.size()>1){
+                                       if 
((nodePhrases.get(0).indexOf(">VP'")>-1 || 
nodePhrases.get(0).indexOf(">NNP'")>-1) &&
+                                                       
(nodePhrases.get(1).indexOf(">VP'")>-1 || 
nodePhrases.get(1).indexOf(">NNP'")>-1)){
+                                               bMeaningf = true;
+
+                                       }
+                               }
+
+                               report.add((String[])stringPhrases.toArray(new 
String[0]));
+                               report.add((String[])nodePhrases.toArray(new 
String[0]));
+                               if (bMeaningf){
+                                       report.add(new String[]{"===", 
"MEANINGFUL tweet"});
+                                       if (!meaningFUL.contains(cleanedTweet))
+                                               meaningFUL.add(cleanedTweet);
+                               } else {
+                                       if (!meaningLESS.contains(cleanedTweet))
+                                               meaningLESS.add(cleanedTweet);
+                               }
+
+                               int count = 0;
+                               ful_less.clear();
+                               for(String less: meaningLESS ){
+                                       String fl = "";
+                                       if (count<meaningFUL.size())
+                                               fl = meaningFUL.get(count);
+                                       ful_less.add(new String[]{less, fl});
+                                       count++;
+                               }
+
+                               report.add(new 
String[]{"-----------------------------------------------------"});
+                                       ProfileReaderWriter.writeReport(report, 
"phrasesExtractedFromTweets3_"+nRun+".csv");
+                                       
ProfileReaderWriter.writeReport(ful_less, "ful_lessTweets3_"+nRun+".csv");
+                               
+                       }
+               }
+       }
+
+
+       public static void main(String[] args){
+               TwitterEngineRunner runner = new TwitterEngineRunner();
+               int nRun = Integer.parseInt(args[0]);
+               runner.processTweetFile(nRun);
+
+       }
+}
+
+/*
+       public void processDirectory(String path){
+               List<String[]> report = new ArrayList<String[]>();
+               report.add(new String[] { "filename", "named entity list", 
"phrases of potential interest list" });
+
+               List<String> allNamedEntities = new ArrayList<String>();
+
+               addFiles(new File(path));
+               for(File f: queue){
+                       List<String> entities = (List<String>) 
extractEntities(f.getAbsolutePath()).getFirst();
+                       List<String> opinions = (List<String>) 
extractEntities(f.getAbsolutePath()).getSecond();
+                       report.add(new String[]{ f.getName(), 
entities.toString(),  opinions.toString()});      
+                       ProfileReaderWriter.writeReport(report, 
"nameEntitiesExtracted.csv");
+
+                       allNamedEntities.addAll(entities);
+
+                       allNamedEntities = new ArrayList<String>(new 
HashSet<String> (allNamedEntities ));
+
+
+               }
+               ProfileReaderWriter.writeReport(report, 
"nameEntitiesTopicsOfInterestExtracted.csv");
+       } 
+} */
+
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
new file mode 100644
index 0000000..0e5053d
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/TwitterFilter.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import 
opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.similarity.apps.utils.Pair;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+public class TwitterFilter {
+       protected static Matcher matcher;
+       private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
+       protected ArrayList<File> queue = new ArrayList<File>();
+       protected static PT2ThicketPhraseBuilder phraseBuilder;
+       protected static SentimentVocab sVocab = SentimentVocab.getInstance();
+       String resourceDirSentimentList = null;
+       Set<String> sentimentVcb = new HashSet<String> ();
+
+       static {
+               synchronized (TwitterFilter.class) {
+                       matcher = new Matcher();
+                       phraseBuilder = new PT2ThicketPhraseBuilder();
+               }
+       }
+
+       public TwitterFilter(){
+               try {
+                       resourceDirSentimentList = new File( "." 
).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
+               } catch (IOException e) {
+                       e.printStackTrace();
+               }
+               List<String[]> sentimentList=null;
+               sentimentList = 
ProfileReaderWriter.readProfiles(resourceDirSentimentList);
+               for(String[] line: sentimentList){
+                       sentimentVcb.add(line[0]);
+               }
+       }
+
+       private boolean isSentimentWord(String word){
+               if (sentimentVcb.contains(word))
+                       return true;
+               else
+                       return false;           
+       }
+
+       public EntityExtractionResult extractEntities(String para){
+               List<List<ParseTreeNode>> extractedNERs = new 
ArrayList<List<ParseTreeNode>>();
+               List<String> extractedNERsWords = new ArrayList<String>();
+               List<List<ParseTreeNode>> extractedSentimentPhrases = 
+                               new ArrayList<List<ParseTreeNode>>();
+               EntityExtractionResult result = new EntityExtractionResult();
+
+               ParseThicket pt = null;
+
+               System.out.println("Processing paragraph of length 
"+para.length() + " | "+ para);
+               pt = matcher.buildParseThicketFromTextWithRST(para);
+               List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();
+
+
+               for(List<ParseTreeNode> sentence: nodeList){
+                       System.out.println("   Processing sentence: "+ 
sentence);
+                       boolean bInsideNER = false; 
+                       String currentPhrase = "";
+                       List<ParseTreeNode> currentPhraseNode = new 
ArrayList<ParseTreeNode>(); 
+                       for(ParseTreeNode word: sentence){
+                               if (isNERforPhraseExtraction(word)){
+                                       System.out.println("++Found word 
="+word + " | NER="+ word.getNe());
+                                       if (bInsideNER){
+                                               currentPhrase += " 
"+word.getWord();
+                                               currentPhraseNode.add(word);
+                                       } else {
+                                               bInsideNER=true;
+                                               currentPhrase = word.getWord();
+                                               currentPhraseNode.add(word);
+                                       }
+                               } else {
+                                       if (bInsideNER){
+                                               if (currentPhrase.indexOf(' 
')>-1) // at least two tokens
+                                                       
extractedNERsWords.add(currentPhrase);
+                                                       
extractedNERs.add(currentPhraseNode);
+                                               currentPhrase = "";
+                                               bInsideNER=false;
+                                       } else {
+                                               // do nothing, continue scan
+                                       }
+                               }
+                       }
+                       if (currentPhrase.length()>1 && currentPhrase.indexOf(' 
')>-1){
+                               extractedNERs.add(currentPhraseNode);
+                               extractedNERsWords.add(currentPhrase);
+                       }
+
+                       Set<String> foundSentimentWords = new HashSet<String>();
+                       // now we extract phrases
+                       List<List<ParseTreeNode>> phrases = pt.getPhrases();
+                       for(List<ParseTreeNode> phrase: phrases){
+                               // find a noun phrase under sentiment
+                               try {
+                                       for(int i = phrase.size()-1; i>-1; i--){
+                                               ParseTreeNode word = 
phrase.get(i);
+                                               if 
((isSentimentWord(word.getWord()) ||
+                                                               
sVocab.isSentimentWord(word.getWord()) && 
!foundSentimentWords.contains(word.getWord()) )){
+                                                       
foundSentimentWords.add(word.getWord());
+                                                       
System.out.println("Found opinionated phrase "+phrase.toString());
+                                                       
extractedSentimentPhrases.add(phrase);                  
+                                                       break;
+                                               }
+                                       }
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                               }
+                       }
+
+               } 
+               result.setExtractedNER(extractedNERs);
+               result.setExtractedNERWords(extractedNERsWords);
+               result.setExtractedSentimentPhrases(extractedSentimentPhrases);
+               return result;
+       }
+
+
+
+       private boolean isNERforPhraseExtraction(ParseTreeNode word){
+               if ((word.getNe().equals("ORGANIZATION") 
||word.getNe().equals("LOCATION") || word.getNe().equals("PERSON") ) &&
+                               (word.getPos().startsWith("NN") || 
word.getPos().startsWith("PR") || word.getPos().startsWith("IN")|| 
+                                               word.getPos().startsWith("JJ") 
|| word.getPos().startsWith("DT")  ))
+                       return true;
+
+               return false;
+
+       }
+
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
new file mode 100644
index 0000000..a138de6
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMiner.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+import opennlp.tools.similarity.apps.utils.PageFetcher;
+
+public class YouTubeMiner {
+       private PageFetcher fetcher = new PageFetcher();
+       public YouTubeMinerResult getData(String url){
+               YouTubeMinerResult result = new YouTubeMinerResult();
+               String content = fetcher.fetchOrigHTML(url);
+               try {
+                       FileUtils.writeStringToFile(new File(url.replace(':', 
'_').replace('/', '_')), content);
+               } catch (IOException e1) {
+                       // TODO Auto-generated catch block
+                       e1.printStackTrace();
+               }
+               if (url.indexOf("channel")>-1){
+                       try { //subscriber-count" title="30" 
+                               String subscribersStr = 
StringUtils.substringBetween(content,"subscriber-count", "tabindex");
+                               String dirtyNumber = 
StringUtils.substringBetween(subscribersStr, "title=\"", "\"");
+                               String cleanNumber = 
dirtyNumber.replaceAll("[^\\x00-\\x7F]", "");
+                               if (cleanNumber!=null){
+                                       int subscribers = 
Integer.parseInt(cleanNumber );
+                                       result.subscribers = subscribers;
+                               } else {
+                                       System.err.println("Not found data for 
'subscriber-count', 'tabindex'");
+                               }
+                       } catch (NumberFormatException e) {
+                               // TODO Auto-generated catch block
+                               e.printStackTrace();
+                       }
+               } else {
+                       try {
+
+                               String subscribersStr = 
StringUtils.substringBetween(content,"subscriber-count", "tabindex");
+                               String dirtyNumber = 
StringUtils.substringBetween(subscribersStr, "title=\"", "\"").replace(" ", "");
+                               if (dirtyNumber!=null){
+                                       int subscribers = 
Integer.parseInt(dirtyNumber );
+                                       result.subscribers = subscribers;
+                               } else {
+                                       System.err.println("Not found data for 
'subscriber-count', 'tabindex'");
+                               }
+
+                               String viewsStrDirty = 
StringUtils.substringBetween(content,
+                                               //"div 
class=\"watch-view-count\">"," views</div>");
+                                               //view-count">12 
просмотров</div>
+                                               "view-count","<div>");
+                               String viewsStr = 
StringUtils.substringBetween(viewsStrDirty,">", " ");
+                               if (viewsStr!=null){
+                                       int views = Integer.parseInt(viewsStr );
+                                       result.views = views;
+                               } else {
+                                       System.err.println("Not found data for 
'view-count','<div>'");
+                               }
+                       } catch (NumberFormatException e) {
+                               // TODO Auto-generated catch block
+                               e.printStackTrace();
+                       }
+               }
+
+               return result;
+       }
+
+
+
+
+       public static void main(String[] args){
+               YouTubeMiner  miner = new YouTubeMiner();
+               
System.out.println(miner.getData("https://www.youtube.com/channel/UC-maQbG5eUS5c1wmaTnLwTA";));
+               
System.out.println(miner.getData("https://www.youtube.com/watch?v=U6X4VT9dVr8";));
+               
System.out.println(miner.getData("https://www.youtube.com/watch?v=kH-AQnta714";));
+               
System.out.println(miner.getData("https://www.youtube.com/watch?v=pWb50Kn1ShQ";));
+       }
+}
+
+

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
new file mode 100644
index 0000000..86c8e9d
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/YouTubeMinerResult.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
/**
 * Plain data holder for counters scraped from a YouTube page by
 * {@code YouTubeMiner}. Counters default to 0 when not found.
 */
public class YouTubeMinerResult {
	public int likes;
	public int subscribers;
	public int views;

	/**
	 * Heuristic: an account is "promising" when either its subscriber count
	 * or its view count falls strictly between 10 and 20000.
	 */
	boolean isPromisingYoungIndividual() {
		boolean midSizedAudience = subscribers > 10 && subscribers < 20000;
		boolean midSizedViewership = views > 10 && views < 20000;
		return midSizedAudience || midSizedViewership;
	}

	/** Human-readable dump of the two counters used by the heuristic. */
	public String toString() {
		return "views :" + views + "| subscribers = " + subscribers;
	}
}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
new file mode 100755
index 0000000..3c88b41
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/LinguisticPatternStructure.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.fca.ConceptLattice;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
/**
 * Pattern structure over linguistic descriptions (lists of ParseTreeChunk
 * groups). Extends PhrasePatternStructure with extent bookkeeping: concepts
 * track which objects (sentence/text indices) support each intent, and
 * extents are propagated up the lattice as intents are added.
 */
public class LinguisticPatternStructure extends PhrasePatternStructure {

	public LinguisticPatternStructure(int objectCounts, int attributeCounts) {
		super(objectCounts, attributeCounts);
		
		// NOTE(review): local is unused and immediately discarded — looks like leftover scaffolding.
		ConceptLattice cl = null;
	}
	
	/**
	 * Recursively adds the given extent to every ancestor of curNode.
	 * NOTE(review): ancestors reachable by multiple paths are visited (and
	 * updated) once per path; presumably addExtents is idempotent — confirm.
	 */
	public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int curNode) {
		//
		if (conceptList.get(curNode).parents.size()>0){
			for (int parent : conceptList.get(curNode).parents){
				conceptList.get(parent).addExtents(extent);
				AddExtentToAncestors(extent, parent);
			}
		}	
	}
	
	/**
	 * AddIntent algorithm (lattice insertion): inserts the intent with its
	 * supporting extent below the maximal concept subsuming it, creating a
	 * new concept and rewiring parent/child links as needed.
	 *
	 * @param intent    the description to insert
	 * @param extent    object indices supporting the intent
	 * @param generator position of the concept to start the search from
	 * @return position of the (existing or newly created) concept for intent
	 */
	public int AddIntent(List<List<ParseTreeChunk>> intent, LinkedHashSet<Integer>extent,int generator) {
		System.out.println("debug");
		System.out.println("called for " + intent);
		//printLattice();
		// Descend to the most specific existing concept whose intent subsumes this one.
		int generator_tmp = GetMaximalConcept(intent, generator);
		generator = generator_tmp;
		if (conceptList.get(generator).intent.equals(intent)) {
			System.out.println("at generator:" + conceptList.get(generator).intent);
			System.out.println("to add:" + intent);
			System.out.println("already generated");
			// Intent already present: just merge the extent into it and its ancestors.
			AddExtentToAncestors(extent, generator);	
			return generator;
		}
		Set<Integer> generatorParents = conceptList.get(generator).parents;
		Set<Integer> newParents = new HashSet<Integer>();
		for (int candidate : generatorParents) {
			if (!intent.containsAll(conceptList.get(candidate).intent)) {
				// Candidate is not a superset: recurse on the meet (intersection)
				// of the two intents so the lattice stays closed under meets.
				List<List<ParseTreeChunk>> intersection = md
				.matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent);
				LinkedHashSet<Integer> new_extent = new LinkedHashSet<Integer>();
				new_extent.addAll(conceptList.get(candidate).extent);
				new_extent.addAll(extent);
				if (intent.size()!=intersection.size()){
					System.out.println("recursive call (inclusion)");
					System.out.println(intent + "----" + intersection);
					candidate = AddIntent(intersection,new_extent, candidate);
				}
			}
			
			// Keep candidate as a parent only if no retained parent already subsumes it;
			// drop retained parents that candidate subsumes.
			boolean addParents = true;
			System.out.println("now iterating over parents");
			Iterator<Integer> iterator = newParents.iterator();
			while (iterator.hasNext()) {
				Integer parent = iterator.next();
				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
					addParents = false;
					break;
				}
				else {
					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
						iterator.remove();
					}
				}
			}
			if (addParents) {
				newParents.add(candidate);
			}
		}
		System.out.println("size of lattice: " + conceptList.size());
		// Create the new concept: its extent is the generator's extent plus the new objects.
		PhraseConcept newConcept = new PhraseConcept();
		newConcept.setIntent(intent);

		LinkedHashSet<Integer> new_extent = new LinkedHashSet<Integer>();
		new_extent.addAll(conceptList.get(generator).extent);
		new_extent.addAll(extent);
		newConcept.addExtents(new_extent);
		
		newConcept.setPosition(conceptList.size());
		conceptList.add(newConcept);
		conceptList.get(generator).parents.add(newConcept.position);
		conceptList.get(newConcept.position).childs.add(generator);
		// Rewire: new concept sits between generator and each retained parent.
		for (int newParent: newParents) {
			if (conceptList.get(generator).parents.contains(newParent)) {
				conceptList.get(generator).parents.remove(newParent);
				conceptList.get(newParent).childs.remove(generator);
			}
			conceptList.get(newConcept.position).parents.add(newParent);
			conceptList.get(newParent).addExtents(new_extent);
			AddExtentToAncestors(new_extent, newParent);
			conceptList.get(newParent).childs.add(newConcept.position);
		}
		return newConcept.position;
	}
	
	/** Prints every concept with extended details (position, intent, extent, stability). */
	public void printLatticeExtended() {
		for (int i = 0; i < conceptList.size(); ++i) {
			printConceptByPositionExtended(i);
		}
	}
	
	public void printConceptByPositionExtended(int index) {
		System.out.println("Concept at position " + index);
		conceptList.get(index).printConceptExtended();
	}
	
	
	/**
	 * Converts the lattice back into a binary object-attribute context:
	 * one column per remaining concept, one row per object; cell [i][j] = 1
	 * iff object i belongs to concept j's extent.
	 *
	 * @param extentCardinality number of objects (row count)
	 */
	public int [][] toContext(int extentCardinality){
		
		int newAttrCount = conceptList.size();
		ArrayList<PhraseConcept> cList = new ArrayList<PhraseConcept>();
		cList.addAll(conceptList);	
		boolean run = true;
		int k=0;
		// Drop the bottom concept (full intent) and, when its extent is empty,
		// its parents as well.
		// NOTE(review): cList.remove(i) with an Integer removes by OBJECT, not
		// by index — it only works if the list contains those boxed values;
		// here cList holds PhraseConcepts, so these removes are no-ops. Verify intent.
		while (run && k<conceptList.size()){
			if (conceptList.get(k).intent.size() == attributeCount){
				if (conceptList.get(k).extent.size() == 0)
					for (Integer i:conceptList.get(k).parents)
						cList.remove(i);
				cList.remove(k);
				run=false;
			}
			else
				k+=1;	
		}
		
		run = true;
		k=0;
		// NOTE(review): the indentation suggests 'run = false' was meant to be
		// the else-branch of the if, but without braces it executes on every
		// iteration, so this loop runs at most once and k is unused afterwards.
		while (run && k<=newAttrCount){
			if (cList.get(k).extent.size()==0)
				k++;
				run = false;
		}
		newAttrCount = cList.size();
		Set<Integer> nodeExtend;
		int [][] binaryContext = new int[extentCardinality][newAttrCount];
		for (int j = 0; j<newAttrCount; j++){
			nodeExtend = cList.get(j).extent;
			for (Integer i: nodeExtend){
				binaryContext[i][j]=1;
			}
		}
		return binaryContext;
	}
	
	
	
	/**
	 * Computes stability estimates for every concept:
	 * intLogStabilityUp = minimum extent-size delta to any child;
	 * intLogStabilityBottom = -log2(sum over children of 2^-delta).
	 */
	public void logStability(){
		int min_delta = -1, delta = -1;
		float sum = 0;
		for (int i = 0; i < conceptList.size(); ++i) {
			min_delta = Integer.MAX_VALUE;
			sum = 0;
			PhraseConcept pc = conceptList.get(i);
			Set<Integer> childs = pc.childs;
			for (Integer j: childs) {
				delta = pc.extent.size() - conceptList.get(j).extent.size();
				if (delta<min_delta)
					min_delta = delta;
				sum += Math.pow(2, -delta);
			}
			pc.intLogStabilityBottom=-(Math.log(sum)/Math.log(2.0));
			pc.intLogStabilityUp = min_delta;
		}
	}

}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
new file mode 100755
index 0000000..c3f5688
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureWriter.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import opennlp.tools.fca.ConceptLattice;
+import opennlp.tools.fca.FormalConcept;
+
+public class PatternStructureWriter {
+       
+       public void WriteStatsToTxt(String filename, PhrasePatternStructure ps){
+                       
+               String formatStr = "[%5.2f; %5.2f]  %s   %s%n";
+               Writer writer = null;
+
+               try {
+                   writer = new BufferedWriter(new OutputStreamWriter(
+                         new FileOutputStream(filename), "utf-8"));
+                   writer.write("PatternStructure size: " + 
ps.conceptList.size()+ " with " + ps.objectCount + "objects\n");
+                   
+                   for (PhraseConcept c : ps.conceptList){
+                       
writer.write(String.format(formatStr,c.intLogStabilityBottom, 
c.intLogStabilityUp, c.extent, c.intent));
+                   }
+                   writer.close();
+                   
+               } catch (IOException ex) {
+                       System.err.println(ex.getMessage());
+               } finally {
+                  try {writer.close();} catch (Exception ex) {}
+               }
+       }
+       
+
+               public static void main(String[] args) {
+                       
+               }
+}
+                       
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
new file mode 100644
index 0000000..e33e089
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.request_response_recognizer;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.external_rst.MatcherExternalRST;
+import opennlp.tools.parse_thicket.external_rst.ParseThicketWithDiscourseTree;
+import 
opennlp.tools.parse_thicket.kernel_interface.TreeKernelBasedClassifierMultiplePara;
+
+/*
+ * This class performs TK learning based on parse thicket which includes RST 
relations only 
+ * based on Surdeanu at al RST parser. It does sentence parsing and NLP 
pipeline of 
+ * Surdeanu's wrapper of Stanford NLP
+ */
+public class TreeKernelBasedRecognizerOfRequest_Response extends 
TreeKernelBasedClassifierMultiplePara{
+
+       private MatcherExternalRST matcherRST = new MatcherExternalRST();
+
+       protected List<String> 
formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
+               //TODO
+               this.setShortRun();     
+               List<String> extendedTreesDumpTotal = new ArrayList<String>();
+               try {
+
+                       for(String text: texts){
+                               // get the parses from original documents, and 
form the training dataset
+                               try {
+                                       System.out.print("About to build pt 
with external rst from "+text + "\n...");
+                                       ParseThicket pt = 
matcherRST.buildParseThicketFromTextWithRST(text);
+                                       if (pt == null)
+                                               continue;
+                                       System.out.print("About to build 
extended forest with external rst...");
+                                       List<String> extendedTreesDump =  // 
use direct option (true
+                                                       
buildReptresentationForDiscourseTreeAndExtensions((ParseThicketWithDiscourseTree)pt,
 true);
+                                       for(String line: extendedTreesDump)
+                                               extendedTreesDumpTotal.add(flag 
+ " |BT| "+line + " |ET| ");
+                                       System.out.println("DONE");
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                               }
+                       }
+               } catch (Exception e) {
+                       e.printStackTrace();
+               }
+               return extendedTreesDumpTotal;
+       }
+
+       private List<String> 
buildReptresentationForDiscourseTreeAndExtensions(ParseThicketWithDiscourseTree 
pt, boolean bDirectDT){
+               List<String> extendedTreesDump = new ArrayList<String>();
+               if (!bDirectDT)
+                       // option 1: use RST relation for extended trees 
+                       extendedTreesDump = 
treeExtender.buildForestForRSTArcs(pt);
+               else {
+                       // option 2: use DT directly
+                       extendedTreesDump.add(pt.getDtDump());
+                   extendedTreesDump.add(pt.getDtDumpWithPOS());
+                   extendedTreesDump.add(pt.getDtDumpWithEmbeddedTrees());
+                   extendedTreesDump.add(pt.getDtDumpWithVerbNet());
+               }               
+               return extendedTreesDump;
+       }
+       
+       public static void main(String[] args){
+               VerbNetProcessor p = VerbNetProcessor.
+                               
getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources");
 
+
+               TreeKernelBasedRecognizerOfRequest_Response proc = new 
TreeKernelBasedRecognizerOfRequest_Response();
+               
proc.setKernelPath("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
+               proc.trainClassifier(
+                               YahooAnswersTrainingSetCreator.origFilesDir,
+                               
YahooAnswersTrainingSetCreator.origFilesDir.replace("/text", "/neg_text")
+                               );
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
new file mode 100644
index 0000000..c060c95
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/YahooAnswersTrainingSetCreator.java
@@ -0,0 +1,118 @@
+package opennlp.tools.parse_thicket.request_response_recognizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.similarity.apps.BingQueryRunner;
+
+import org.apache.commons.io.FileUtils;
+
+public class YahooAnswersTrainingSetCreator {
+       protected List<File> queuePos = new ArrayList<File>(), queueNeg = new 
ArrayList<File>();
+       public static String origFilesDir = 
"/Users/bgalitsky/Downloads/NewCategoryIdentification/text";
+       //private BingQueryRunner searcher = new BingQueryRunner();
+       protected void addFilesPos(File file) {
+
+               if (!file.exists()) {
+                       System.out.println(file + " does not exist.");
+               }
+               if (file.isDirectory()) {
+                       for (File f : file.listFiles()) {
+                               addFilesPos(f);
+                               System.out.println(f.getName());
+                       }
+               } else {
+                       queuePos.add(file);
+               }
+       }
+       
+       protected void addFilesNeg(File file) {
+
+               if (!file.exists()) {
+                       System.out.println(file + " does not exist.");
+               }
+               if (file.isDirectory()) {
+                       for (File f : file.listFiles()) {
+                               addFilesNeg(f);
+                               System.out.println(f.getName());
+                       }
+               } else {
+                       queueNeg.add(file);
+               }
+       }
+       
+       public void formNegTrainingSet(String posPath , String negPath){
+                if (!new File(negPath).exists())
+                        new File(negPath).mkdir();
+                
+               addFilesPos(new File(posPath));
+               for(int i=0; i< queuePos.size()-1; i+=2){ //take two files at a 
time
+                       File f1 = queuePos.get(i), f2 = queuePos.get(i+1);
+                       String content1 = null, content2 = null;
+            try {
+                   content1 = FileUtils.readFileToString(f1);
+                   content2 = FileUtils.readFileToString(f2);
+            } catch (IOException e) {
+                   e.printStackTrace();
+            }
+                       String[] portions1 = content1.split("\n\n");
+                       String[] portions2 = content2.split("\n\n");
+
+                       portions1 = splitIntoRR(portions1, content1);
+                       portions2 = splitIntoRR(portions2, content2);
+                       if (portions1==null || portions2==null)
+                               continue;
+                       // do cross-breeding
+                       try {
+                   FileUtils.writeStringToFile(new File(negPath+"/" + 
f1.getName()+".txt"),
+                               portions1[0] + "\n\n" + portions2[1] );
+                   FileUtils.writeStringToFile(new File(negPath+"/" + 
f2.getName()+".txt"),
+                               portions2[0] + "\n\n" + portions1[1] );
+            } catch (IOException e) {
+                   e.printStackTrace();
+            }
+               }
+               
+               
+       }
+       private String[] splitIntoRR(String[] portions, String content) {
+               if (portions.length<2 ){
+                       portions = content.replace("?","#_#").split("#_#");
+               }
+               if (portions.length<2 ){
+                       portions = content.split("\n");
+               }
+               if (portions.length<2)
+                       return null;
+               if (portions.length>2){
+                       String q= "", a = "";
+                       boolean bQ = true;
+                       for(int p=0; p<portions.length; p++){
+                               if ( bQ )
+                                       q+=portions[p]+" \n";
+                               else
+                                       a +=portions[p]+" \n";
+                               
+                               if (portions[p].endsWith("?")){
+                                       bQ=false;
+                               }
+
+                       }
+                       if (!bQ) {
+                               portions = new String[2];
+                               portions[0] = q;
+                               portions[1] = a;
+                       } else
+                               return null;
+               }
+               
+               return portions;
+    }
+       
+       public static void main(String[] args){
+               String dir = YahooAnswersTrainingSetCreator.origFilesDir;
+               new YahooAnswersTrainingSetCreator().formNegTrainingSet(dir, 
dir.replace("/text", "/neg_text"));
+       }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
new file mode 100644
index 0000000..025403c
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingRelatedSpellingQueryRunner.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.similarity.apps;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.ArrayList;
+import java.util.List;
+
+import net.billylieurance.azuresearch.AzureSearchRelatedSearchQuery;
+import net.billylieurance.azuresearch.AzureSearchRelatedSearchResult;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchSpellingSuggestionQuery;
+import net.billylieurance.azuresearch.AzureSearchSpellingSuggestionResult;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
+
+public class BingRelatedSpellingQueryRunner extends BingQueryRunner{
+       private AzureSearchRelatedSearchQuery aq = new 
AzureSearchRelatedSearchQuery ();
+       private AzureSearchSpellingSuggestionQuery  ssq = new 
AzureSearchSpellingSuggestionQuery ();
+       
+       
+       public List<HitBase> runSearch(String query, int nRes) {
+               aq.setAppid(BING_KEY);
+               aq.setQuery(query);             
+               aq.setPerPage(nRes);
+               aq.doQuery();
+               
+               List<HitBase> results = new ArrayList<HitBase> ();
+               AzureSearchResultSet<AzureSearchRelatedSearchResult> ars = 
aq.getQueryResult();
+               
+               for (AzureSearchRelatedSearchResult anr : ars){
+                   HitBase h = new HitBase();
+                   h.setTitle(anr.getTitle());
+                   h.setUrl(anr.getBingUrl());
+                   results.add(h);
+               }
+               return results;
+       }
+       
+       public List<HitBase> runSSSearch(String query, int nRes) {
+               ssq.setAppid(BING_KEY);
+               ssq.setQuery(query);            
+               ssq.setPerPage(nRes);
+               ssq.doQuery();
+               
+               List<HitBase> results = new ArrayList<HitBase> ();
+               AzureSearchResultSet<AzureSearchSpellingSuggestionResult> ars = 
ssq.getQueryResult();
+               
+               for ( AzureSearchSpellingSuggestionResult anr : ars){
+                   HitBase h = new HitBase();
+                   h.setTitle(anr.getTitle());
+                   h.setAbstractText(anr.getValue());
+                  
+                   results.add(h);
+               }
+               return results;
+       }
+       
+       public static void main(String[] args) {
+               BingRelatedSpellingQueryRunner self = new 
BingRelatedSpellingQueryRunner();
+           try {
+               self.setLang("es-MX");
+               self.setKey(
+                               "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=");
+             List<HitBase> resp = self
+                 .runSearch("clear Sess", 10);
+             System.out.print(resp.get(0));
+             
+             resp = self
+                         .runSSSearch("clear Sess", 10);
+                     System.out.print(resp.get(0));
+           } catch (Exception e) {
+             // TODO Auto-generated catch block
+             e.printStackTrace();
+           }
+       }
+}

Reply via email to