http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java new file mode 100644 index 0000000..a5e1ee7 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + + + + + + + + + + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class NERPhraseGeneralizer extends PhraseGeneralizer { + + /* alignment is based on NER values, not on POS now + * + */ + + + /** + * key matching function which takes two phrases, aligns them and finds a set + * of maximum common sub-phrase + * + * @param chunk1 + * @param chunk2 + * @return + */ + @Override + public List<ParseTreeChunk> generalize( + Object chunk1o, Object chunk2o) { + + ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o; + List<ParseTreeNode> results = new ArrayList<ParseTreeNode>(); + List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>(); + + + List<String> pos1 = chunk1.getPOSs(); + List<String> pos2 = chunk2.getPOSs(); + List<String> lem1 = chunk1.getLemmas(); + List<String> lem2 = chunk2.getLemmas(); + + List<String> ner1 = new ArrayList<String>(); + List<String> ner2 = new ArrayList<String>(); + + + for (ParseTreeNode node: chunk1.getParseTreeNodes()) { + if (node.getNe()!=null && !node.getNe().equals("O")) + ner1.add(node.getNe()); + } + + for (ParseTreeNode node: chunk2.getParseTreeNodes()) { + if (node.getNe()!=null && !node.getNe().equals("O")) + ner2.add(node.getNe()); + } + + + List<String> overlap = new ArrayList<String>(ner1); + overlap.retainAll(ner2); + overlap = new ArrayList<String>(new HashSet<String>(overlap)); + + + if (overlap == null || overlap.size() < 1) + return null; + + List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>(); + for (String word : overlap) { + Integer i1 = ner1.indexOf(word); + Integer i2 = ner2.indexOf(word); + occur1.add(i1); + 
occur2.add(i2); + } + + + // for verbs find alignment even if no same verb lemmas, just any pair of verbs. Usually should be 0,0 + if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) { + Integer i1 = null, i2 = null; + for(int i=0; i< pos1.size(); i++){ + if (pos1.get(i).startsWith("VB")){ + i1 = i; + break; + } + } + + for(int i=0; i< pos2.size(); i++){ + if (pos2.get(i).startsWith("VB")){ + i2 = i; + break; + } + } + occur1.add(i1); + occur2.add(i2); + } + // now we search for plausible sublists of overlaps + // if at some position correspondence is inverse (one of two position + // decreases instead of increases) + // then we terminate current alignment accum and start a new one + List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>(); + // starts from 1, not 0 + List<int[]> accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(0), occur2.get(0) }); + for (int i = 1; i < occur1.size(); i++) { + + if (occur1.get(i) > occur1.get(i - 1) + && occur2.get(i) > occur2.get(i - 1)) + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + else { + overlapsPlaus.add(accum); + if (occur1!=null && occur2!=null && i<occur1.size() && i<occur2.size() ){ + accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + } + } + } + if (accum.size() > 0) { + overlapsPlaus.add(accum); + } + + + for (List<int[]> occur : overlapsPlaus) { + List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>(); + for (int[] column : occur) { + occr1.add(column[0]); + occr2.add(column[1]); + } + + int ov1 = 0, ov2 = 0; // iterators over common words; + List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>(); + // we start two words before first word + int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2; + Boolean bReachedCommonWord = false; + while (k1 < 0 || k2 < 0) { + k1++; + k2++; + } + int k1max = pos1.size() - 1, k2max = pos2.size() - 1; + while (k1 <= k1max && k2 <= k2max) { + /* // first check if the same POS + String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2)); + String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1), + lem2.get(k2), sim); + */ + String sim = null; + List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2)); + generalize(pos1.get(k1), pos2.get(k2)); + if (!sims.isEmpty()) + sim = sims.get(0); + + String lemmaMatch = null; + List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps, + generalize(lem1.get(k1), + lem2.get(k2)); + if (!lemmaMatchs.isEmpty()) + lemmaMatch = lemmaMatchs.get(0); + + + + if ((sim != null) + && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch + .equals("fail")))) { + commonPOS.add(pos1.get(k1)); + + + // doing parse tree node generalization + List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2)); + if (genRes.size()==1) + results.add(genRes.get(0)); + + if (lemmaMatch != null) { + commonLemmas.add(lemmaMatch); + // System.out.println("Added "+lemmaMatch); + if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) + bReachedCommonWord = true; // now we can have different increment + // opera + else { + if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1 + && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) { + ov1++; + ov2++; + bReachedCommonWord = true; + } + // else + // System.err.println("Next match reached '"+lemmaMatch+ + // "' | k1 - k2: "+k1 + " "+k2 + + // "| occur index ov1-ov2 "+ + // ov1+" "+ov2+ + // "| identified 
positions of match: occr1.get(ov1) - occr2.get(ov1) " + // + + // occr1.get(ov1) + " "+ occr2.get(ov1)); + } + } else { + commonLemmas.add("*"); + } // the same parts of speech, proceed to the next word in both + // expressions + k1++; + k2++; + + } else if (!bReachedCommonWord) { + k1++; + k2++; + } // still searching + else { + // different parts of speech, jump to the next identified common word + ov1++; + ov2++; + if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1) + break; + // now trying to find + int kk1 = occr1.get(ov1) - 2, // new positions of iterators + kk2 = occr2.get(ov2) - 2; + int countMove = 0; + while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is + // behind + // current + // position, + // synchroneously + // move + // towards + // right + kk1++; + kk2++; + countMove++; + } + k1 = kk1; + k2 = kk2; + + if (k1 > k1max) + k1 = k1max; + if (k2 > k2max) + k2 = k2max; + bReachedCommonWord = false; + } + } + ParseTreeChunk currResult = new ParseTreeChunk(results), + currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0); + + + resultChunks.add(currResult); + } + + return resultChunks; + } + +}
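The alignment step above collects, for each shared NER tag (or verb anchor), its position in both phrases, and then splits those paired positions into maximal runs in which both indices strictly increase; each run is one plausible alignment, later expanded word by word into a common sub-phrase. A minimal standalone sketch of that grouping logic (class and method names are illustrative, not part of the committed code):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    class AlignmentRuns {
        /** Splits paired positions into maximal runs where both sides strictly increase. */
        static List<List<int[]>> plausibleRuns(List<Integer> occur1, List<Integer> occur2) {
            List<List<int[]>> runs = new ArrayList<List<int[]>>();
            List<int[]> accum = new ArrayList<int[]>();
            accum.add(new int[] { occur1.get(0), occur2.get(0) });
            for (int i = 1; i < Math.min(occur1.size(), occur2.size()); i++) {
                boolean monotone = occur1.get(i) > occur1.get(i - 1)
                        && occur2.get(i) > occur2.get(i - 1);
                if (!monotone) { // order inverted: close the current run, start a new one
                    runs.add(accum);
                    accum = new ArrayList<int[]>();
                }
                accum.add(new int[] { occur1.get(i), occur2.get(i) });
            }
            runs.add(accum);
            return runs;
        }

        public static void main(String[] args) {
            List<List<int[]>> runs = plausibleRuns(Arrays.asList(0, 2, 1), Arrays.asList(1, 3, 0));
            System.out.println(runs.size()); // prints 2: runs (0,1)(2,3) and (1,0)
        }
    }

For example, occur1 = [0, 2, 1] and occur2 = [1, 3, 0] yield two runs, [(0,1), (2,3)] and [(1,0)], because the third pair breaks the monotone order.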
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
new file mode 100644
index 0000000..8001a7b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
@@ -0,0 +1,73 @@
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+
+public class ParseTreeNodeGeneralizer implements IGeneralizer<ParseTreeNode> {
+  private LemmaGeneralizer lGen = new LemmaGeneralizer();
+  private PartOfSpeechGeneralizer posGen = new PartOfSpeechGeneralizer();
+  private VerbNetProcessor vnProc = VerbNetProcessor.getInstance(null);
+
+  @Override
+  public List<ParseTreeNode> generalize(Object o1, Object o2) {
+    List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();
+    ParseTreeNode ch1 = (ParseTreeNode) o1, ch2 = (ParseTreeNode) o2;
+
+    // generalize parts of speech first; if they are incompatible, there is no
+    // node-level generalization, so return the empty list (callers check the size)
+    List<String> posGenStrList = posGen.generalize(ch1.getPos(), ch2.getPos());
+    if (posGenStrList.isEmpty())
+      return results;
+    String posGenStr = posGenStrList.get(0);
+
+    // generalize lemmas; "*" stands for "any word" when the lemmas do not match
+    ParseTreeNode newNode;
+    List<String> lemmaGen = lGen.generalize(ch1.getWord(), ch2.getWord());
+    if (!lemmaGen.isEmpty())
+      newNode = new ParseTreeNode(lemmaGen.get(0), posGenStr, "O", -1);
+    else
+      newNode = new ParseTreeNode("*", posGenStr, "O", -1);
+
+    newNode.setPhraseType(ch1.getPhraseType());
+    // TODO separate NER generalizer
+    // TODO multiword generalizer
+    if (posGenStr.startsWith("NN")) {
+      // keep the named-entity tag only if both nodes agree on it
+      if (ch1.getNe() != null && ch2.getNe() != null && ch1.getNe().equals(ch2.getNe()))
+        newNode.setNe(ch1.getNe());
+    }
+    if (posGenStr.startsWith("VB")) {
+      // for verbs, attach the VerbNet generalization as node attributes
+      List<Map<String, List<String>>> verbNetGenList = vnProc.generalize(ch1.getWord(), ch2.getWord());
+      if (verbNetGenList.size() > 0) {
+        Map<String, List<String>> verbNetGen = verbNetGenList.get(0);
+        Map<String, Object> attr = newNode.getAttributes();
+        if (attr == null)
+          attr = new HashMap<String, Object>();
+        try {
+          List<String> phrDscr = (List<String>) attr.get("phrDescr");
+          if (phrDscr != null) // de-duplicate the phrase descriptions and store them back
+            attr.put("phrDescr", new ArrayList<String>(new HashSet<String>(phrDscr)));
+        } catch (Exception e) {
+          System.err.println("Problem de-duplicating verbnet expr " + attr);
+        }
+        if (verbNetGen != null) {
+          attr.putAll(verbNetGen);
+          newNode.setAttributes(attr);
+        }
+      }
+    } else if (posGenStr.startsWith("NN")) {
+      // TODO noun-specific attribute generalization
+    }
+
+    results.add(newNode);
+    return results;
+  }
+}
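A hedged usage sketch of the node-level generalizer above; the ParseTreeNode(word, pos, ne, id) constructor and the getWord()/getPos() accessors are taken from this commit, while the demo class itself is illustrative and assumes VerbNet resources have already been registered through VerbNetProcessor.getInstance (as done in PersonalInformationExtractor.main further down):

    import java.util.List;
    import opennlp.tools.parse_thicket.ParseTreeNode;
    import opennlp.tools.parse_thicket.matching.ParseTreeNodeGeneralizer;

    class NodeGenDemo {
        public static void main(String[] args) {
            ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer();
            // two noun nodes with different words and no named-entity tag
            ParseTreeNode n1 = new ParseTreeNode("camera", "NN", "O", 1);
            ParseTreeNode n2 = new ParseTreeNode("printer", "NN", "O", 3);
            List<ParseTreeNode> gen = nodeGen.generalize(n1, n2);
            // NN vs NN keeps the POS; with no common lemma the word degrades to "*"
            if (!gen.isEmpty())
                System.out.println(gen.get(0).getWord() + "/" + gen.get(0).getPos()); // */NN
        }
    }

Incompatible tags (say NN vs IN) produce an empty list instead, which is why PhraseGeneralizer only consumes the result when exactly one node comes back.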
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
new file mode 100644
index 0000000..66efe23
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+
+public class PartOfSpeechGeneralizer implements IGeneralizer<String> {
+
+  @Override
+  public List<String> generalize(Object o1, Object o2) {
+    String pos1 = (String) o1, pos2 = (String) o2;
+    List<String> results = new ArrayList<String>();
+    String res = computeSimilarity(pos1, pos2);
+    if (res != null)
+      results.add(res);
+    return results;
+  }
+
+  private String computeSimilarity(String pos1, String pos2) {
+    // a noun and a noun phrase generalize into a noun, e.g. NN vs NP -> "NN"
+    if (pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN") && pos1.equals("NP")) {
+      return "NN";
+    }
+    // a gerund can act as a noun, e.g. NN vs VBG -> "NN"
+    if (pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("NN") && pos1.equals("VBG")) {
+      return "NN";
+    }
+    // a noun and an adjectival phrase generalize into a noun
+    if (pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN") && pos1.equals("ADJP")) {
+      return "NN";
+    }
+    if (pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO") && pos2.equals("IN")) {
+      return "IN";
+    }
+    // VBx vs VBy = VB (the particular verb form does not matter)
+    if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
+      return "VB";
+    }
+    // identical tags generalize into themselves
+    if (pos1.equalsIgnoreCase(pos2)) {
+      return pos1;
+    }
+    // otherwise ABx vs ABy gives "AB*": truncate both tags to their
+    // two-letter family and compare, e.g. JJR vs JJS -> "JJ*"
+    if (pos1.length() > 2) {
+      pos1 = pos1.substring(0, 2);
+    }
+    if (pos2.length() > 2) {
+      pos2 = pos2.substring(0, 2);
+    }
+    if (pos1.equalsIgnoreCase(pos2)) {
+      return pos1 + "*";
+    }
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java new file mode 100644 index 0000000..5df0dee --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.matching; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; +import opennlp.tools.parse_thicket.VerbNetProcessor; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; + +public class PersonalInformationExtractor { + FrameQueryBasedIExtractor extractor = new FrameQueryBasedIExtractor(); + private ArrayList<File> queue = new ArrayList<File>(); + private Tika tika = new Tika(); + + public void runExtractor(String filename){ + String content = null; + try { + content = FileUtils.readFileToString(new File(filename)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + extractor.buildTemplates(new String[] { "John Doe send his California driver license 1234567 . " + + "Jill Jones received her Ohio license 4567456. ", + " Mary Poppins got her identification 8765. Jorge Malony sold his identification 9876. ", + //" President Jorge Smith of Microsoft used his id 4567. Manager John Smith of Google used his id 8765. " + " Johh Doe 123. Don Joe 1323. " + + }); + + List<GeneralizationResult> res = extractor.doIE( content); + + } + + + private void addFiles(File file) { + + try { + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + try { + addFiles(f); + } catch (Exception e) { + } + } + } else { + queue.add(file); + } + } catch (Exception e) { + + } + } + + public void processDirectory(String filename, String template) throws IOException { + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[]{"filename", "text", "generalization", "fired?" 
}); + String templateStr = null; + try { + + templateStr = FileUtils.readFileToString(new File(template)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + String[] samples = templateStr.split("&"); + + extractor.buildTemplates(samples); + + addFiles(new File(filename)); + + + for (File f : queue) { + String content=null; + try { + content = tika.parseToString(f); + List<GeneralizationResult> res = extractor.doIE( content); + + for(GeneralizationResult gr: res){ + report.add(new String[]{filename, gr.getText(), gr.getGen().toString(), gr.getbFire().toString() }); + } + + } catch (TikaException e) { + System.out.println("Tika problem with file" + f.getAbsolutePath()); + } catch (Exception ee){ + ee.printStackTrace(); + } + ProfileReaderWriter.writeReport(report, "PII_report.csv"); + } + + queue.clear(); + } + + + public void runExtractor(String filename, String template){ + String content = null, templateStr = null; + try { + content = FileUtils.readFileToString(new File(filename)); + templateStr = FileUtils.readFileToString(new File(template)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + String[] samples = templateStr.split("&"); + + extractor.buildTemplates(samples); + + List<GeneralizationResult> res = extractor.doIE( content); + List<String[]> report = new ArrayList<String[]>(); + + for(GeneralizationResult gr: res){ + report.add(new String[]{filename, gr.getText(), gr.getGen().toString(), gr.getbFire().toString() }); + } + + + } + + public static void main(String[] args){ + //String filename = "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/pii/agreement.txt"; + + if (args ==null || args.length!=3) + System.err.println("Usage: java -Xmx10g -jar *.jar path-to-resources path-to-file-to-analyze path-to-file-with_samples\n"); + try { + VerbNetProcessor.getInstance(args[0]); + new PersonalInformationExtractor().processDirectory( args[1], args[2]); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java new file mode 100644 index 0000000..61d5f0f --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.List; + + + + + + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> { + + private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer(); + protected LemmaGeneralizer lemmaFormManager = new LemmaGeneralizer(); + //protected LemmaFormManager lemmaFormManager = new LemmaFormManager(); + + + protected PartOfSpeechGeneralizer posManager = new PartOfSpeechGeneralizer(); + + protected PStemmer ps = new PStemmer(); + protected ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer(); + + /** + * key matching function which takes two phrases, aligns them and finds a set + * of maximum common sub-phrase + * + * @param chunk1 + * @param chunk2 + * @return + */ + @Override + public List<ParseTreeChunk> generalize( + Object chunk1o, Object chunk2o) { + + ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o; + + List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>(); + + + List<String> pos1 = chunk1.getPOSs(); + List<String> pos2 = chunk2.getPOSs(); + List<String> lem1 = chunk1.getLemmas(); + List<String> lem2 = chunk2.getLemmas(); + + List<String> lem1stem = new ArrayList<String>(); + List<String> lem2stem = new ArrayList<String>(); + + + for (String word : lem1) { + try { + lem1stem.add(ps.stem(word.toLowerCase()).toString()); + } catch (Exception e) { + // e.printStackTrace(); + + if (word.length() > 2) + System.err.println("Unable to stem: " + word); + } + } + try { + for (String word : lem2) { + lem2stem.add(ps.stem(word.toLowerCase()).toString()); + } + } catch (Exception e) { + System.err.println("problem processing word " + lem2.toString()); + } + + List<String> overlap = new ArrayList<String>(lem1stem); + overlap.retainAll(lem2stem); + + if (overlap == null || overlap.size() < 1) + return null; + + // to accumulate starts of alignments + List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>(); + + // for verbs find alignment even if no same verb lemmas, just any pair of verbs. 
Usually should be 0,0 + if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) { + Integer i1 = null, i2 = null; + for(int i=0; i< pos1.size(); i++){ + if (pos1.get(i).startsWith("VB")){ + i1 = i; + break; + } + } + + for(int i=0; i< pos2.size(); i++){ + if (pos2.get(i).startsWith("VB")){ + i2 = i; + break; + } + } + if (i1!=null) + occur1.add(i1); + if (i2!=null) + occur2.add(i2); + } + + + for (String word : overlap) { + Integer i1 = lem1stem.indexOf(word); + Integer i2 = lem2stem.indexOf(word); + occur1.add(i1); + occur2.add(i2); + } + + + // now we search for plausible sublists of overlaps + // if at some position correspondence is inverse (one of two position + // decreases instead of increases) + // then we terminate current alignment accum and start a new one + List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>(); + // starts from 1, not 0 + List<int[]> accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(0), occur2.get(0) }); + for (int i = 1; i < occur1.size() && i< occur2.size(); i++) { + + if (occur1.get(i) > occur1.get(i - 1) + && occur2.get(i) > occur2.get(i - 1)) + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + else { + overlapsPlaus.add(accum); + accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + } + } + if (accum.size() > 0) { + overlapsPlaus.add(accum); + } + + + for (List<int[]> occur : overlapsPlaus) { + List<ParseTreeNode> results = new ArrayList<ParseTreeNode>(); + List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>(); + for (int[] column : occur) { + occr1.add(column[0]); + occr2.add(column[1]); + } + + int ov1 = 0, ov2 = 0; // iterators over common words; + List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>(); + // we start two words before first word + int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2; + // if (k1<0) k1=0; if (k2<0) k2=0; + Boolean bReachedCommonWord = false; + while (k1 < 0 || k2 < 0) { + k1++; + k2++; + } + int k1max = pos1.size() - 1, k2max = pos2.size() - 1; + while (k1 <= k1max && k2 <= k2max) { + // first check if the same POS + String sim = null; + List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2)); + generalize(pos1.get(k1), pos2.get(k2)); + if (!sims.isEmpty()) + sim = sims.get(0); + + String lemmaMatch = null; + List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps, + generalize(lem1.get(k1), + lem2.get(k2)); + if (!lemmaMatchs.isEmpty()) + lemmaMatch = lemmaMatchs.get(0); + + + if ((sim != null) + && (lemmaMatch == null || (lemmaMatch != null ))) { + commonPOS.add(pos1.get(k1)); + + + // doing parse tree node generalization + List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2)); + if (genRes.size()==1) + results.add(genRes.get(0)); + + if (lemmaMatch != null) { + commonLemmas.add(lemmaMatch); + // System.out.println("Added "+lemmaMatch); + if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) + bReachedCommonWord = true; // now we can have different increment + // opera + else { + if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1 + && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) { + ov1++; + ov2++; + bReachedCommonWord = true; + } + } + } else { + commonLemmas.add("*"); + } // the same parts of speech, proceed to the next word in both + // expressions + k1++; + k2++; + + } else if (!bReachedCommonWord) { + k1++; + k2++; + } // still searching + else { + // different parts 
of speech, jump to the next identified common word + ov1++; + ov2++; + if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1) + break; + // now trying to find + int kk1 = occr1.get(ov1) - 2, // new positions of iterators + kk2 = occr2.get(ov2) - 2; + int countMove = 0; + while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is + // behind + // current + // position, + // synchroneously + // move + // towards + // right + kk1++; + kk2++; + countMove++; + } + k1 = kk1; + k2 = kk2; + + if (k1 > k1max) + k1 = k1max; + if (k2 > k2max) + k2 = k2max; + bReachedCommonWord = false; + } + } + ParseTreeChunk currResult = new ParseTreeChunk(results); + //currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0); + + + resultChunks.add(currResult); + } + + return resultChunks; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java new file mode 100644 index 0000000..094f093 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class PhraseGroupGeneralizer implements IGeneralizer<List<ParseTreeChunk>>{ + + private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer(); + + private LemmaFormManager lemmaFormManager = new LemmaFormManager(); + + private POSManager posManager = new POSManager(); + + private PhraseGeneralizer pGen = new PhraseGeneralizer(); + private NERPhraseGeneralizer pGenNER = new NERPhraseGeneralizer(); + + /** + * main function to generalize two expressions grouped by phrase types returns + * a list of generalizations for each phrase type with filtered + * sub-expressions + * + * @param sent1 + * @param sent2 + * @return List<List<ParseTreeChunk>> list of list of POS-words pairs for each + * resultant matched / overlapped phrase + */ + @Override + public List<List<ParseTreeChunk>> generalize(Object o1, Object o2) { + + + List<List<ParseTreeChunk>> sent1 = (List<List<ParseTreeChunk>>)o1, + sent2 = (List<List<ParseTreeChunk>>) o2 ; + List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>(); + // first iterate through component + for (int comp = 0; comp < 2 && // just np & vp + comp < sent1.size() && comp < sent2.size(); comp++) { + List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>(); + // then iterate through each phrase in each component + // first try lemma-based alignment + for (ParseTreeChunk ch1 : sent1.get(comp)) { + for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version + List<ParseTreeChunk> chunkToAdd=null; + try { + chunkToAdd = pGen.generalize(ch1, ch2); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if (chunkToAdd == null){ + chunkToAdd = new ArrayList<ParseTreeChunk>(); + } + Boolean alreadyThere = false; + for (ParseTreeChunk chunk : resultComps) { + if (chunkToAdd.contains(chunk)) { + alreadyThere = true; + break; + } + } + + if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) { + resultComps.addAll(chunkToAdd); + } + + } + } // then try NER-based alignment + if (comp==0 || resultComps.size()<1){ + for (ParseTreeChunk ch1 : sent1.get(comp)) { + for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version + List<ParseTreeChunk> chunkToAdd = pGenNER.generalize( + ch1, ch2); + + if (chunkToAdd == null){ + chunkToAdd = new ArrayList<ParseTreeChunk>(); + } + + Boolean alreadyThere = false; + for (ParseTreeChunk chunk : resultComps) { + if (chunkToAdd.contains(chunk)) { + alreadyThere = true; + break; + } + } + + if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) { + resultComps.addAll(chunkToAdd); + } + + } + } + } + + List<ParseTreeChunk> resultCompsRed = generalizationListReducer.applyFilteringBySubsumption(resultComps); + + resultComps = resultCompsRed; + results.add(resultComps); + } + + return results; + } + + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java ---------------------------------------------------------------------- diff 
--git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java new file mode 100644 index 0000000..ba8a140 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class AbstractEngineRunner { + private List<File> queue; + private final static String reviewSource = "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/opinions/macbook_pro.txt"; + NamedEntityExtractor neExtractor = new NamedEntityExtractor(); + + public void processJSONfileWithReviews(){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "text", "phrases of potential interest list" , }); + + + String content=null; + try { + content = FileUtils.readFileToString(new File(reviewSource)); + } catch (IOException e) { + e.printStackTrace(); + } + String[] texts = StringUtils.substringsBetween(content, "reviewText\": \"", "\", \"overall"); + for(String text: texts){ + EntityExtractionResult result = neExtractor.extractEntities(text); + report.add(new String[]{text}); + //report.add((String[])result.extractedNERWords.toArray(new String[0])); + //report.add((String[])result.extractedSentimentPhrases.toArray(new String[0])); + List<String> stringPhrases = new ArrayList<String>(), + nodePhrases = new ArrayList<String>(); + for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){ + String buf = "", nodeBuf=""; + for(ParseTreeNode ch: chList){ + buf+=ch.getWord()+ " "; + nodeBuf+=ch.toString()+ " "; + } + stringPhrases.add(buf.trim()); + nodePhrases.add(nodeBuf.trim()); + } + report.add((String[])stringPhrases.toArray(new String[0])); + report.add((String[])nodePhrases.toArray(new String[0])); + report.add(new String[]{"-----------------------------"}); + ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv"); + } + } + + // this func collects files + private void addFiles(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + if (f.getName().startsWith(".")) + 
continue; + addFiles(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + + } + } + + public static void main(String[] args){ + AbstractEngineRunner runner = new AbstractEngineRunner(); + runner.processJSONfileWithReviews(); + + } +} + +/* + public void processDirectory(String path){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" }); + + List<String> allNamedEntities = new ArrayList<String>(); + + addFiles(new File(path)); + for(File f: queue){ + List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst(); + List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond(); + report.add(new String[]{ f.getName(), entities.toString(), opinions.toString()}); + ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv"); + + allNamedEntities.addAll(entities); + + allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities )); + + + } + ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv"); + } +} */ http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java new file mode 100644 index 0000000..44a3640 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.io.IOException; +import java.util.List; + +import edu.stanford.nlp.util.logging.Redwood; + +import java.util.Iterator; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.logging.Logger; + +import org.ejml.simple.SimpleMatrix; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Label; +import edu.stanford.nlp.ling.LabeledWord; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.ling.WordLemmaTag; +import edu.stanford.nlp.ling.WordTag; +import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree; +import edu.stanford.nlp.sentiment.SentimentUtils; +import edu.stanford.nlp.trees.MemoryTreebank; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; + +public class DefaultSentimentProcessor { + /** A logger for this class */ + private static final Logger log = Logger + .getLogger("opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor"); + + private static final NumberFormat NF = new DecimalFormat("0.0000"); + + enum Output { + PENNTREES, VECTORS, ROOT, PROBABILITIES + } + + enum Input { + TEXT, TREES + } + + /** + * Sets the labels on the tree (except the leaves) to be the integer + * value of the sentiment prediction. Makes it easy to print out + * with Tree.toString() + */ + static void setSentimentLabels(Tree tree) { + if (tree.isLeaf()) { + return; + } + + for (Tree child : tree.children()) { + setSentimentLabels(child); + } + + Label label = tree.label(); + if (!(label instanceof CoreLabel)) { + throw new IllegalArgumentException("Required a tree with CoreLabels"); + } + CoreLabel cl = (CoreLabel) label; + cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree))); + } + + /** + * Sets the labels on the tree to be the indices of the nodes. + * Starts counting at the root and does a postorder traversal. + */ + static int setIndexLabels(Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + tree.label().setValue(Integer.toString(index)); + index++; + for (Tree child : tree.children()) { + index = setIndexLabels(child, index); + } + return index; + } + + /** + * Outputs the vectors from the tree. Counts the tree nodes the + * same as setIndexLabels. 
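+ * Each output line has the form " nodeIndex: v0 v1 ...": the node index
+ * assigned by setIndexLabels, followed by that node's vector values
+ * formatted to four decimal places (see {@code NF}).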
+ */ + static int outputTreeVectors(PrintStream out, Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + out.print(" " + index + ":"); + SimpleMatrix vector = RNNCoreAnnotations.getNodeVector(tree); + for (int i = 0; i < vector.getNumElements(); ++i) { + out.print(" " + NF.format(vector.get(i))); + } + out.println(); + index++; + for (Tree child : tree.children()) { + index = outputTreeVectors(out, child, index); + } + return index; + } + + /** + * Outputs the scores from the tree. Counts the tree nodes the + * same as setIndexLabels. + */ + static int outputTreeScores(PrintStream out, Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + out.print(" " + index + ":"); + SimpleMatrix vector = RNNCoreAnnotations.getPredictions(tree); + for (int i = 0; i < vector.getNumElements(); ++i) { + out.print(" " + NF.format(vector.get(i))); + } + out.println(); + index++; + for (Tree child : tree.children()) { + index = outputTreeScores(out, child, index); + } + return index; + } + + public static <T> String wordToString(T o, final boolean justValue) { + return wordToString(o, justValue, null); + } + + public static <T> String wordToString(T o, final boolean justValue, + final String separator) { + if (justValue && o instanceof Label) { + if (o instanceof CoreLabel) { + CoreLabel l = (CoreLabel) o; + String w = l.value(); + if (w == null) + w = l.word(); + return w; + } else { + return (((Label) o).value()); + } + } else if (o instanceof CoreLabel) { + CoreLabel l = ((CoreLabel) o); + String w = l.value(); + if (w == null) + w = l.word(); + if (l.tag() != null) { + if (separator == null) { + return w + CoreLabel.TAG_SEPARATOR + l.tag(); + } else { + return w + separator + l.tag(); + } + } + return w; + // an interface that covered these next four cases would be + // nice, but we're moving away from these data types anyway + } else if (separator != null && o instanceof TaggedWord) { + return ((TaggedWord) o).toString(separator); + } else if (separator != null && o instanceof LabeledWord) { + return ((LabeledWord) o).toString(); + } else if (separator != null && o instanceof WordLemmaTag) { + return ((WordLemmaTag) o).toString(separator); + } else if (separator != null && o instanceof WordTag) { + return ((WordTag) o).toString(separator); + } else { + return (o.toString()); + } + } + + + /** + * Returns the sentence as a string with a space between words. + * It prints out the {@code value()} of each item - + * this will give the expected answer for a short form representation + * of the "sentence" over a range of cases. It is equivalent to + * calling {@code toString(true)}. + * + * TODO: Sentence used to be a subclass of ArrayList, with this + * method as the toString. Therefore, there may be instances of + * ArrayList being printed that expect this method to be used. + * + * @param list The tokenized sentence to print out + * @return The tokenized sentence as a String + */ + public static <T> String listToString(List<T> list) { + return listToString(list, true); + } + /** + * Returns the sentence as a string with a space between words. + * Designed to work robustly, even if the elements stored in the + * 'Sentence' are not of type Label. + * + * This one uses the default separators for any word type that uses + * separators, such as TaggedWord. 
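+ * For example (hypothetical tokens), a two-element list of TaggedWord items
+ * [the/DT, dog/NN] prints as "the/DT dog/NN" when justValue is false, and as
+ * "the dog" when justValue is true.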
+ * + * @param list The tokenized sentence to print out + * @param justValue If {@code true} and the elements are of type + * {@code Label}, return just the + * {@code value()} of the {@code Label} of each word; + * otherwise, + * call the {@code toString()} method on each item. + * @return The sentence in String form + */ + public static <T> String listToString(List<T> list, final boolean justValue) { + return listToString(list, justValue, null); + } + + /** + * As already described, but if separator is not null, then objects + * such as TaggedWord + * + * @param separator The string used to separate Word and Tag + * in TaggedWord, etc + */ + public static <T> String listToString(List<T> list, final boolean justValue, + final String separator) { + StringBuilder s = new StringBuilder(); + for (Iterator<T> wordIterator = list.iterator(); wordIterator.hasNext();) { + T o = wordIterator.next(); + s.append(wordToString(o, justValue, separator)); + if (wordIterator.hasNext()) { + s.append(' '); + } + } + return s.toString(); + } + + /** + * Outputs a tree using the output style requested + */ + static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) { + Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); + for (Output output : outputFormats) { + switch (output) { + case PENNTREES: { + Tree copy = tree.deepCopy(); + setSentimentLabels(copy); + out.println(copy); + break; + } + case VECTORS: { + Tree copy = tree.deepCopy(); + setIndexLabels(copy, 0); + out.println(copy); + outputTreeVectors(out, tree, 0); + break; + } + case ROOT: { + out.println(" " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); + break; + } + case PROBABILITIES: { + Tree copy = tree.deepCopy(); + setIndexLabels(copy, 0); + out.println(copy); + outputTreeScores(out, tree, 0); + break; + } + default: + throw new IllegalArgumentException("Unknown output format " + output); + } + } + } + + /** + * Reads an annotation from the given filename using the requested input. 
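+ * For TEXT input, the tokenizer splits the file into sentences and wraps each
+ * sentence in its own single-sentence Annotation; for TREES input, each tree
+ * read from the file is wrapped the same way, so either path yields one
+ * Annotation per sentence.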
+ */ + public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) { + switch (inputFormat) { + case TEXT: { + String text = IOUtils.slurpFileNoExceptions(filename); + Annotation annotation = new Annotation(text); + tokenizer.annotate(annotation); + List<Annotation> annotations = Generics.newArrayList(); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + Annotation nextAnnotation = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)); + nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence)); + annotations.add(nextAnnotation); + } + return annotations; + } + case TREES: { + List<Tree> trees; + if (filterUnknown) { + trees = SentimentUtils.readTreesWithGoldLabels(filename); + trees = SentimentUtils.filterUnknownRoots(trees); + } else { + trees = Generics.newArrayList(); + MemoryTreebank treebank = new MemoryTreebank("utf-8"); + treebank.loadPath(filename, null); + for (Tree tree : treebank) { + trees.add(tree); + } + } + + List<Annotation> annotations = Generics.newArrayList(); + for (Tree tree : trees) { + CoreMap sentence = new Annotation(listToString(tree.yield())); + sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree); + List<CoreMap> sentences = Collections.singletonList(sentence); + Annotation annotation = new Annotation(""); + annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences); + annotations.add(annotation); + } + return annotations; + } + default: + throw new IllegalArgumentException("Unknown format " + inputFormat); + } + } + + /** Runs the tree-based sentiment model on some text. */ + public void processTextWithArgs(String[] args) throws IOException { + String parserModel = null; + String sentimentModel = null; + + String filename = null; + String fileList = null; + boolean stdin = false; + + boolean filterUnknown = false; + + List<Output> outputFormats = Collections.singletonList(Output.ROOT); + Input inputFormat = Input.TEXT; + + String tlppClass = "DEFAULT_TLPP_CLASS"; + + for (int argIndex = 0; argIndex < args.length; ) { + if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { + sentimentModel = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { + parserModel = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-file")) { + filename = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-fileList")) { + fileList = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-stdin")) { + stdin = true; + argIndex++; + } else if (args[argIndex].equalsIgnoreCase("-input")) { + inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase()); + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-output")) { + String[] formats = args[argIndex + 1].split(","); + outputFormats = new ArrayList<>(); + for (String format : formats) { + outputFormats.add(Output.valueOf(format.toUpperCase())); + } + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) { + filterUnknown = true; + argIndex++; + } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) { + tlppClass = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-help")) { + System.exit(0); + } else { + log.info("Unknown argument " + args[argIndex + 1]); + throw new IllegalArgumentException("Unknown argument " + args[argIndex + 1]); + } + } + + 
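+    // Example invocation (hypothetical paths; flags as parsed above):
+    //   -file reviews.txt -output ROOT,PENNTREES -sentimentModel sentiment.ser.gz
+    //   -input TREES -file trees.mrg -filterUnknown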
// We construct two pipelines. One handles tokenization, if + // necessary. The other takes tokenized sentences and converts + // them to sentiment trees. + Properties pipelineProps = new Properties(); + Properties tokenizerProps = null; + if (sentimentModel != null) { + pipelineProps.setProperty("sentiment.model", sentimentModel); + } + if (parserModel != null) { + pipelineProps.setProperty("parse.model", parserModel); + } + if (inputFormat == Input.TREES) { + pipelineProps.setProperty("annotators", "binarizer, sentiment"); + pipelineProps.setProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator"); + pipelineProps.setProperty("binarizer.tlppClass", tlppClass); + pipelineProps.setProperty("enforceRequirements", "false"); + } else { + pipelineProps.setProperty("annotators", "parse, sentiment"); + pipelineProps.setProperty("enforceRequirements", "false"); + tokenizerProps = new Properties(); + tokenizerProps.setProperty("annotators", "tokenize, ssplit"); + } + + if (stdin && tokenizerProps != null) { + tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true"); + } + + int count = 0; + if (filename != null) count++; + if (fileList != null) count++; + if (stdin) count++; + if (count > 1) { + throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin"); + } + if (count == 0) { + throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin"); + } + + StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps); + StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps); + + if (filename != null) { + // Process a file. The pipeline will do tokenization, which + // means it will split it into sentences as best as possible + // with the tokenizer. + List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown); + for (Annotation annotation : annotations) { + pipeline.annotate(annotation); + + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + System.out.println(sentence); + outputTree(System.out, sentence, outputFormats); + } + } + } else if (fileList != null) { + // Process multiple files. The pipeline will do tokenization, + // which means it will split it into sentences as best as + // possible with the tokenizer. Output will go to filename.out + // for each file. + for (String file : fileList.split(",")) { + List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown); + FileOutputStream fout = new FileOutputStream(file + ".out"); + PrintStream pout = new PrintStream(fout); + for (Annotation annotation : annotations) { + pipeline.annotate(annotation); + + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + pout.println(sentence); + outputTree(pout, sentence, outputFormats); + } + } + pout.flush(); + fout.close(); + } + } else { + // Process stdin. Each line will be treated as a single sentence. + log.info("Reading in text from stdin."); + log.info("Please enter one sentence per line."); + log.info("Processing will end when EOF is reached."); + BufferedReader reader = IOUtils.readerFromStdin("utf-8"); + + for (String line; (line = reader.readLine()) != null; ) { + line = line.trim(); + if ( ! 
line.isEmpty()) { + Annotation annotation = tokenizer.process(line); + pipeline.annotate(annotation); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + outputTree(System.out, sentence, outputFormats); + } + } else { + // Output blank lines for blank lines so the tool can be + // used for line-by-line text processing + System.out.println(); + } + } + + } + } + + public float getNumericSentimentValue(String expression) { + Properties props = new Properties(); + props.setProperty("annotators", "tokenize, ssplit, parse, sentiment"); + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + int mainSentiment = 0; + if (expression != null && expression.length() > 0) { + int longest = 0; + Annotation annotation = pipeline.process(expression); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + Tree tree = sentence.get(SentimentAnnotatedTree.class); + int sentiment = RNNCoreAnnotations.getPredictedClass(tree); + String partText = sentence.toString(); + if (partText.length() > longest) { + mainSentiment = sentiment; + longest = partText.length(); + } + } + } + return mainSentiment; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java new file mode 100644 index 0000000..69eae1d --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java @@ -0,0 +1,158 @@ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.util.List; + +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.similarity.apps.HitBase; + +public class EntityExtractionResult { + List<List<ParseTreeNode>> extractedNERs; + public List<String> extractedNERWords; + // phrases w/sentiments + public List<List<ParseTreeNode>> extractedSentimentPhrases; + public List<String> extractedSentimentPhrasesStr; + // phrases w/o sentiments + public List<List<ParseTreeNode>> extractedNONSentimentPhrases; + public List<String> extractedNONSentimentPhrasesStr; + public List<Float> sentimentProfile; + + + public List<String> getExtractedSentimentPhrasesStr() { + return extractedSentimentPhrasesStr; + } + + public void setExtractedSentimentPhrasesStr(List<String> extractedSentimentPhrasesStr) { + this.extractedSentimentPhrasesStr = extractedSentimentPhrasesStr; + } + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
new file mode 100644
index 0000000..69eae1d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.similarity.apps.HitBase;
+
+// plain holder for the output of entity/phrase extraction
+public class EntityExtractionResult {
+  List<List<ParseTreeNode>> extractedNERs;
+  public List<String> extractedNERWords;
+  // phrases with sentiments
+  public List<List<ParseTreeNode>> extractedSentimentPhrases;
+  public List<String> extractedSentimentPhrasesStr;
+  // phrases without sentiments
+  public List<List<ParseTreeNode>> extractedNONSentimentPhrases;
+  public List<String> extractedNONSentimentPhrasesStr;
+  public List<Float> sentimentProfile;
+  public List<HitBase> hits;
+  private List<List<ParseTreeNode>> extractedNerPhrases;
+  private List<String> extractedNerPhrasesStr;
+  private List<String> extractedNerPhraseTags;
+  private List<List<ParseTreeNode>> extractedNerExactPhrases;
+  private List<String> extractedNerExactStr;
+
+  public List<String> getExtractedSentimentPhrasesStr() {
+    return extractedSentimentPhrasesStr;
+  }
+
+  public void setExtractedSentimentPhrasesStr(List<String> extractedSentimentPhrasesStr) {
+    this.extractedSentimentPhrasesStr = extractedSentimentPhrasesStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNONSentimentPhrases() {
+    return extractedNONSentimentPhrases;
+  }
+
+  public void setExtractedNONSentimentPhrases(List<List<ParseTreeNode>> extractedNONSentimentPhrases) {
+    this.extractedNONSentimentPhrases = extractedNONSentimentPhrases;
+  }
+
+  public List<String> getExtractedNONSentimentPhrasesStr() {
+    return extractedNONSentimentPhrasesStr;
+  }
+
+  public void setExtractedNONSentimentPhrasesStr(List<String> extractedNONSentimentPhrasesStr) {
+    this.extractedNONSentimentPhrasesStr = extractedNONSentimentPhrasesStr;
+  }
+
+  public void setExtractedNERWords(List<String> extractedNERWords) {
+    this.extractedNERWords = extractedNERWords;
+  }
+
+  public void setExtractedSentimentPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
+    this.extractedSentimentPhrases = extractedSentimentPhrases;
+  }
+
+  public void setExtractedNER(List<List<ParseTreeNode>> extractedNERs) {
+    this.extractedNERs = extractedNERs;
+  }
+
+  public void setGossipHits(List<HitBase> hitsForAnEntity) {
+    hits = hitsForAnEntity;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNERs() {
+    return extractedNERs;
+  }
+
+  public void setExtractedNERs(List<List<ParseTreeNode>> extractedNERs) {
+    this.extractedNERs = extractedNERs;
+  }
+
+  public List<HitBase> getHits() {
+    return hits;
+  }
+
+  public void setHits(List<HitBase> hits) {
+    this.hits = hits;
+  }
+
+  public List<String> getExtractedNERWords() {
+    return extractedNERWords;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedSentimentPhrases() {
+    return extractedSentimentPhrases;
+  }
+
+  public void setSentimentProfile(List<Float> sentimentProfile) {
+    this.sentimentProfile = sentimentProfile;
+  }
+
+  public List<Float> getSentimentProfile() {
+    return sentimentProfile;
+  }
+
+  public void setExtractedNerPhrases(List<List<ParseTreeNode>> extractedNerPhrases) {
+    this.extractedNerPhrases = extractedNerPhrases;
+  }
+
+  public void setExtractedNerPhrasesStr(List<String> extractedNerPhrasesStr) {
+    this.extractedNerPhrasesStr = extractedNerPhrasesStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNerPhrases() {
+    return extractedNerPhrases;
+  }
+
+  public List<String> getExtractedNerPhrasesStr() {
+    return extractedNerPhrasesStr;
+  }
+
+  public void setExtractedNerPhraseTags(List<String> extractedNerPhraseTags) {
+    this.extractedNerPhraseTags = extractedNerPhraseTags;
+  }
+
+  public List<String> getExtractedNerPhraseTags() {
+    return this.extractedNerPhraseTags;
+  }
+
+  public void setExtractedNerExactPhrases(List<List<ParseTreeNode>> extractedNerExactPhrases) {
+    this.extractedNerExactPhrases = extractedNerExactPhrases;
+  }
+
+  public void setExtractedNerExactStr(List<String> extractedNerExactStr) {
+    this.extractedNerExactStr = extractedNerExactStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNerExactPhrases() {
+    return extractedNerExactPhrases;
+  }
+
+  public List<String> getExtractedNerExactStr() {
+    return extractedNerExactStr;
+  }
+}
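EntityExtractionResult above is a plain data holder with no behavior of its own. A minimal usage sketch follows; the demo class and the sample values are hypothetical, not part of this patch.

package opennlp.tools.parse_thicket.opinion_processor;

import java.util.Arrays;

// Hypothetical demo: fill and read the holder as an extractor would.
public class EntityExtractionResultDemo {
  public static void main(String[] args) {
    EntityExtractionResult result = new EntityExtractionResult();
    // sample values only; a real extractor would set these from parse thickets
    result.setExtractedNERWords(Arrays.asList("Mexico"));
    result.setExtractedSentimentPhrasesStr(Arrays.asList("poor president"));
    result.setSentimentProfile(Arrays.asList(1.0f)); // one value per sentence, on the 0..4 scale
    for (String phrase : result.getExtractedSentimentPhrasesStr()) {
      System.out.println("sentiment phrase: " + phrase);
    }
  }
}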
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
new file mode 100644
index 0000000..dc89d8b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.Properties;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.util.CoreMap;
+
+public class ExpressionSentimentAnalyzer {
+  // returns the predicted sentiment class (0..4) of the longest sentence in the line;
+  // note that a new pipeline is built on every call
+  float findSentiment(String line) {
+    Properties props = new Properties();
+    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
+    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+    int mainSentiment = 0;
+    if (line != null && line.length() > 0) {
+      int longest = 0;
+      Annotation annotation = pipeline.process(line);
+      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
+        Tree tree = sentence.get(SentimentAnnotatedTree.class);
+        int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
+        String partText = sentence.toString();
+        if (partText.length() > longest) {
+          mainSentiment = sentiment;
+          longest = partText.length();
+        }
+      }
+    }
+    return mainSentiment;
+  }
+
+  public static void main(String[] args) {
+    float sent = new ExpressionSentimentAnalyzer().findSentiment("poor presidential nominee Hillary Clinton visited Mexico");
+    System.out.println(sent);
+  }
+}
\ No newline at end of file
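findSentiment() above returns the raw class index predicted by the RNN. Assuming the standard five-class scale of the Stanford sentiment model (0 = very negative through 4 = very positive), a small hypothetical helper can turn that index into a readable label:

// Hypothetical helper, not part of this patch.
public class SentimentLabels {
  // standard five-class scale of the Stanford sentiment model
  static final String[] LABELS =
      { "Very negative", "Negative", "Neutral", "Positive", "Very positive" };

  public static String label(int predictedClass) {
    return (predictedClass >= 0 && predictedClass < LABELS.length)
        ? LABELS[predictedClass] : "Unknown";
  }

  public static void main(String[] args) {
    System.out.println(label(1)); // prints "Negative"
  }
}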
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
new file mode 100644
index 0000000..0f53ec5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
@@ -0,0 +1,591 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.similarity.apps.BingQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+import opennlp.tools.similarity.apps.utils.ValueSortMap;
+import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+public class LinguisticPhraseManager {
+  private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>();
+
+  // this static object is initialized here so that the path to resources is set up front
+  private static StopList stop = StopList.getInstance(new File(".").getAbsolutePath().replace(".", "") + "src/test/resources/");
+
+  // this list will be overwritten by the external synonyms.csv
+  private static String[][] synonymPairs = new String[][] {};
+  private PStemmer stemmer = new PStemmer();
+
+  private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>();
+  private List<String> standardizedTopics = new ArrayList<String>();
+  // maps each ling phrase to the list of ling phrases with the same head noun (its group)
+  private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
+
+  // maps each standardized string phrase to the list of ling phrases with the same head noun
+  private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>();
+
+  private BingQueryRunner runner = new BingQueryRunner();
+  private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3; //2; 5
+  private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
+
+  private String resourceDir;
+
+  public LinguisticPhraseManager() {
+    try {
+      resourceDir = new File(".").getCanonicalPath() + "/src/main/resources/";
+      List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir + "/synonyms.csv");
+      synonymPairs = new String[vocabs.size()][2];
+      int count = 0;
+      for (String[] line : vocabs) {
+        synonymPairs[count] = line;
+        count++;
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  // takes a logged chain of parse tree nodes and builds a ParseTreeChunk instance;
+  // the phrase should be VP or NP, otherwise parsing is expected to fail
+  private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr) {
+    ParseTreeChunk ch = new ParseTreeChunk();
+    List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
+
+    String[] parts = phrStr.replace("]", "").split(", <");
+
+    ch.setMainPOS(StringUtils.substringBetween(phrStr, ">", "'"));
+    try {
+      for (String part : parts) {
+        String lemma = StringUtils.substringBetween(part, "P'", "':").toLowerCase();
+        String pos = part.substring(part.indexOf(":") + 1, part.length());
+
+        if (pos == null || lemma == null) {
+          continue;
+        }
+        POSs.add(pos.trim());
+        lemmas.add(lemma.trim());
+        // (re)attach the lists each iteration so a partially parsed phrase
+        // still carries whatever was read before a failure
+        ch.setPOSs(POSs);
+        ch.setLemmas(lemmas);
+      }
+    } catch (Exception e) {
+      // we expect exceptions when an extracted phrase is NEITHER NP nor VP;
+      // the resulting (empty) chunk will not create a new topic
+      e.printStackTrace();
+    }
+
+    return ch;
+  }
+
+  // constructor taking an array of extraction files, optimized for performance;
+  // only topics occurring at least MIN_NUMBER_OF_PHRASES_TO_CONSIDER times are kept
+  public LinguisticPhraseManager(String[] loadPaths) {
+    List<String[]> columns = new ArrayList<String[]>();
+    for (String file : loadPaths) {
+      columns.addAll(ProfileReaderWriter.readProfiles(file));
+    }
+
+    for (String[] l : columns) {
+      if (l.length < 3 || l[1] == null || l[2] == null)
+        continue;
+      String word = l[1].toLowerCase().trim();
+      if (word.indexOf("=>") > -1)
+        continue;
+
+      word = isAcceptableStringPhrase(word);
+      if (word == null)
+        continue;
+
+      if (!freq.containsKey(word)) {
+        freq.put(word, 1);
+      } else {
+        freq.put(word, freq.get(word) + 1);
+        // once the count for a topic reaches the threshold, create it
+        if (freq.get(word) == MIN_NUMBER_OF_PHRASES_TO_CONSIDER) {
+          ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
+          ch = isAcceptableLingPhrase(ch);
+          if (ch == null)
+            continue;
+          lingPhrases.add(ch);
+        }
+      }
+    }
+    // we don't need the frequency data any more
+    freq.clear();
+  }
+
+  // default constructor with a single topic extraction file, not optimized for performance
+  public LinguisticPhraseManager(String loadPath) {
+    List<String[]> columns = ProfileReaderWriter.readProfiles(loadPath);
+    for (String[] l : columns) {
+      if (l.length < 3 || l[1] == null || l[2] == null)
+        continue;
+      String word = l[1].toLowerCase().trim();
+      if (word.indexOf("=>") > -1)
+        continue;
+
+      word = isAcceptableStringPhrase(word);
+      if (word == null)
+        continue;
+
+      if (!freq.containsKey(word)) {
+        ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
+        ch = isAcceptableLingPhrase(ch);
+        if (ch == null)
+          continue;
+        freq.put(word, 1);
+        lingPhrases.add(ch);
+      } else {
+        freq.put(word, freq.get(word) + 1);
+      }
+    }
+    freq = ValueSortMap.sortMapByValue(freq, false);
+  }
+
+  // removes prepositions and articles in case this has not happened at the phrase-forming stage
+  private String isAcceptableStringPhrase(String word) {
+    if (word.startsWith("to "))
+      return null;
+    if (word.startsWith("a "))
+      return word.substring(2, word.length());
+
+    if (word.endsWith(" !") || word.endsWith(" ."))
+      return word.substring(0, word.length() - 2).trim();
+
+    return word;
+  }
+
+  // we only accept NP phrases
+  private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
+    if (!ch.getMainPOS().equals("NP"))
+      return null;
+
+    return ch;
+  }
+
+  // groups are sets of phrases with the same head noun;
+  // put all phrases in a group and map each phrase to its group (the list of members)
+  public void doLingGrouping() {
+    for (int i = 0; i < lingPhrases.size(); i++) {
+      for (int j = i + 1; j < lingPhrases.size(); j++) {
+        ParseTreeChunk chI = lingPhrases.get(i);
+        ParseTreeChunk chJ = lingPhrases.get(j);
+        if (chI.getLemmas().get(chI.getLemmas().size() - 1).equals(chJ.getLemmas().get(chJ.getLemmas().size() - 1))
+            && chI.getPOSs().get(chI.getLemmas().size() - 1).startsWith("NN")) {
+          List<ParseTreeChunk> values = null;
+          if (chI.getLemmas().size() < chJ.getLemmas().size()) {
+            // fetch the existing group for chJ so earlier members are kept
+            values = entry_group.get(chJ);
+            if (values == null)
+              values = new ArrayList<ParseTreeChunk>();
+            values.add(chI);
+            entry_group.put(chJ, values);
+          } else {
+            values = entry_group.get(chI);
+            if (values == null)
+              values = new ArrayList<ParseTreeChunk>();
+            values.add(chJ);
+            entry_group.put(chI, values);
+          }
+        }
+      }
+    }
+  }
+
+  public List<String> formStandardizedTopic() {
+    Set<ParseTreeChunk> keys = entry_group.keySet();
+    for (ParseTreeChunk k : keys) {
+      List<ParseTreeChunk> lingPhrases = entry_group.get(k);
+      for (int i = 0; i < lingPhrases.size(); i++)
+        for (int j = i + 1; j < lingPhrases.size(); j++) {
+          ParseTreeChunk chI = lingPhrases.get(i);
+          ParseTreeChunk chJ = lingPhrases.get(j);
+          List<String> lemmas = new ArrayList<String>(chI.getLemmas());
+          lemmas.retainAll(chJ.getLemmas());
+          if (lemmas.size() < 2)
+            continue;
+          String buf = "";
+          List<String> candTopicLst = new ArrayList<String>();
+          for (String w : lemmas) {
+            if (w.length() < MIN_LENGTH_OF_WORD_TO_CONSIDER)
+              continue;
+            if (!StringUtils.isAlpha(w))
+              continue;
+            // find the POS of w
+            boolean bAccept = false;
+            for (int iw = 0; iw < chI.getLemmas().size(); iw++) {
+              if (w.equals(chI.getLemmas().get(iw))) {
+                if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ")
+                    || chI.getPOSs().get(iw).startsWith("VB"))
+                  bAccept = true;
+              }
+            }
+            if (bAccept) {
+              String ws = substituteSynonym(w);
+              candTopicLst.add(ws);
+            }
+          }
+          // remove duplicates like 'new new house'
+          //candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst));
+          for (String w : candTopicLst) {
+            buf += w + " ";
+          }
+
+          buf = buf.trim();
+          if (buf.indexOf(' ') < 0)
+            continue;
+
+          if (!standardizedTopics.contains(buf)) {
+            standardizedTopics.add(buf);
+            std_group.put(buf, lingPhrases);
+          }
+        }
+    }
+    cleanUpStandardizedTopics();
+
+    return standardizedTopics;
+  }
+
+  public void cleanUpStandardizedTopics() {
+    List<String> toDelete = new ArrayList<String>();
+    for (int i = 0; i < standardizedTopics.size(); i++)
+      for (int j = i + 1; j < standardizedTopics.size(); j++) {
+        List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false);
+        List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false);
+        for (int k = 0; k < t1.size(); k++) {
+          t1.set(k, stemmer.stem(t1.get(k)));
+        }
+        for (int k = 0; k < t2.size(); k++) {
+          t2.set(k, stemmer.stem(t2.get(k)));
+        }
+        // check if the stemmed token lists are equal
+        if (t1.size() != t2.size())
+          continue;
+        // once all keywords of the two phrases are stemmed, one phrase may annihilate the other
+        t1.removeAll(t2);
+        if (t1.isEmpty()) {
+          if (standardizedTopics.get(i).length() > standardizedTopics.get(j).length()) {
+            toDelete.add(standardizedTopics.get(i));
+            // TODO update std_group entry
+            System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '"
+                + standardizedTopics.get(j) + "'");
+            List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
+            stJ.addAll(std_group.get(standardizedTopics.get(i)));
+            stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
+            std_group.put(standardizedTopics.get(j), stJ);
+          } else {
+            toDelete.add(standardizedTopics.get(j));
+            System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '"
+                + standardizedTopics.get(i) + "'");
+            List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
+            stI.addAll(std_group.get(standardizedTopics.get(j)));
+            stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
+            std_group.put(standardizedTopics.get(i), stI);
+          }
+        }
+      }
+    for (String d : toDelete) {
+      //System.out.println("Removed '" + d + "'");
+      standardizedTopics.remove(d);
+    }
+  }
+
+  // substitute synonyms according to the internal vocabulary
+  private String substituteSynonym(String w) {
+    try {
+      for (String[] pair : synonymPairs) {
+        if (w.equals(pair[0]))
+          return pair[1];
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+    return w;
+  }
+
+  public void generateGroupingReport(String reportName) {
+    List<String[]> report = new ArrayList<String[]>();
+    Set<ParseTreeChunk> chs = entry_group.keySet();
+    report.add(new String[] { "string phrase", "class", "linguistic phrase", "list of ling phrases class representatives" });
+
+    for (ParseTreeChunk ch : chs) {
+      String head = ch.getLemmas().get(ch.getLemmas().size() - 1);
+      List<ParseTreeChunk> values = entry_group.get(ch);
+      if (values.size() < 6)
+        head = "";
+      report.add(new String[] { ch.toWordOnlyString(), head, ch.toString(), values.toString() });
+    }
+    ProfileReaderWriter.writeReport(report, reportName);
+  }
+
+  // final merge of <floor - floors - flooring> as head noun, with phrase update
+  public void applyLastRoundOfAggregation() {
+    /*
+    List<ParseTreeChunk> entries = new ArrayList<ParseTreeChunk>(entry_group.keySet());
+    for(int i=0; i< entries.size(); i++){
+      for(int j=i+1; j< entries.size(); j++){
+        ParseTreeChunk chI = entries.get(i);
+        ParseTreeChunk chJ = entries.get(j);
+        String headI = getLastElement(chI.getLemmas());
+        String headJ = getLastElement(chJ.getLemmas());
+        if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER ||
+            headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
+          continue;
+
+        if (headI.indexOf(headJ)>-1){
+          //leave headJ
+          List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ);
+          List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI);
+          if (valuesToAddTo==null || valuesBeingAdded == null)
+            continue;
+          valuesToAddTo.addAll(valuesBeingAdded);
+          entry_group.put(chJ, valuesToAddTo);
+          entry_group.remove(chI);
+          System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
+        } else if (headJ.indexOf(headI)>-1){
+          //leave headI
+          List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI);
+          List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ);
+          if (valuesToAddTo==null || valuesBeingAdded == null)
+            continue;
+          valuesToAddTo.addAll(valuesBeingAdded);
+          entry_group.put(chI, valuesToAddTo);
+          entry_group.remove(chJ);
+          System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
+        }
+      }
+    }
+    */
+    for (int i = 0; i < standardizedTopics.size(); i++)
+      for (int j = i + 1; j < standardizedTopics.size(); j++) {
+        String headI = extractHeadNounFromPhrase(standardizedTopics.get(i));
+        String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j));
+        // if the heads are the same word, do nothing
+ if (headI.equals(headJ)) + continue; + + //only if one is sub-word of another + if (headI.indexOf(headJ)>-1){ + + if (!properSubWordForm(headI, headJ)) + continue; + //entry 'I' will be updated + String newKey = standardizedTopics.get(i).replace(headI, headJ); + + List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i)); + List<ParseTreeChunk> stInew = std_group.get(newKey); + //if (stInew!=null && !stInew.isEmpty()) + // stI.addAll(stInew); + if(stI==null) + continue; + std_group.put(newKey, stI); + std_group.remove(standardizedTopics.get(i)); + System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created '"+ newKey +"'"); + standardizedTopics.set(i, newKey); + + } else if (headJ.indexOf(headI)>-1){ + if (!properSubWordForm(headJ, headI)) + continue; + //entry 'J' will be updated + String newKey = standardizedTopics.get(j).replace(headJ, headI); + + List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j)); + List<ParseTreeChunk> stJnew = std_group.get(newKey); + //if (stJnew!=null && !stJnew.isEmpty()) + // stJ.addAll(stJnew); + if(stJ==null) + continue; + std_group.put(newKey, stJ); + std_group.remove(standardizedTopics.get(j)); + System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created '"+ newKey +"'"); + standardizedTopics.set(j, newKey); + } + } + + + + } + + private boolean properSubWordForm(String headI, String headJ) { + String suffix = headI.replace(headJ, ""); + if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er") + || suffix.equals("rooms") || + suffix.equals("") || suffix.equals("counter") || + suffix.equals("room") || suffix.equals("back")) + return true; + + //System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'"); + return false; + } + + //generates report + public void generateStdTopicReport(String reportName){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" }); + + for(String t: standardizedTopics){ + + String bufCover = ""; + int count = 0; + List<ParseTreeChunk> ptcList = std_group.get(t); + if (ptcList == null) + continue; + for(ParseTreeChunk ch: ptcList){ + List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false); + List<String> tList = TextProcessor.fastTokenize(t, false); + List<String> tListChk = new ArrayList<String>(tList); + + tListChk.removeAll(candidate); + // fully covered by phrase instance + if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){ + continue; + } + + boolean bCovered = true; + + for(String ts: tList){ + boolean bCandWordsIsCovered = false; + for(String s: candidate){ + if ((s.indexOf(ts)>-1) )// && properSubWordForm(s, ts)) + bCandWordsIsCovered = true; + } + if (!bCandWordsIsCovered){ + bCovered = false; + break; + } + } + if (!bCovered) + continue; + bufCover+=ch.toWordOnlyString()+ " # "; + count++; + if (count > 40) + break; + + } + if (bufCover.endsWith(" # ")) + bufCover = bufCover.substring(0, bufCover.length()-3).trim(); + + String buf = ""; + count = 0; + // only up to 40 instances of phrases per 1-st level topic + for(ParseTreeChunk ch: ptcList){ + buf+=ch.toWordOnlyString()+ "|"; + count++; + if (count > 40) + break; + } + + //TODO uncomment + //t = spell.getSpellCheckResult(t); + report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString() + }); + } + + + ProfileReaderWriter.writeReport(report, reportName); + } + // get a last word from a phrase (supposed 
to be a head noun) + private String extractHeadNounFromPhrase(String topic){ + String[] tops = topic.split(" "); + int len = tops.length; + if (len>1){ + return tops[len-1]; + } + else return topic; + } + + // get last elem of a list + private String getLastElement(List<String> arrayList ){ + if (arrayList != null && !arrayList.isEmpty()) { + return arrayList.get(arrayList.size()-1); + } + return null; + } + /* + * Using Bing API to check if an extracted phrase can be found on the web, therefore is a meaningful phrase + */ + public List<String> verifyTopic(){ + Set<String> phrases = freq.keySet(); + List<String> approvedPhrases = new ArrayList<String>(); + for(String p: phrases){ + List<HitBase> hits = runner.runSearch("\""+p+"\""); + for(HitBase h: hits){ + String lookup = h.getTitle() + " " + h.getAbstractText(); + if (lookup.indexOf(p)>-1){ + approvedPhrases.add(p); + break; + } + } + } + return approvedPhrases; + } + + public Set<String> getPhraseLookup(){ + return freq.keySet(); + } + + // using phrase frequency to filter phrases + public boolean isAcceptablePhrase(String phrase){ + Integer count = freq.get(phrase.toLowerCase().trim()); + if (count==null) + return false; + + if (count>0 && count < 10000) + return true; + return false; + } + + public static void main(String[] args){ + LinguisticPhraseManager man = new LinguisticPhraseManager( + "/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv"); + man.doLingGrouping(); + man.generateGroupingReport("topics_groups7_mergedHeads.csv"); + List<String> stdTopics = man.formStandardizedTopic(); + man.applyLastRoundOfAggregation(); + man.generateStdTopicReport("std_topics7_mergedHeads.csv"); + System.out.println(stdTopics); + + } +}
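doLingGrouping() above compares every pair of phrases, which is quadratic in the number of phrases. The same head-noun bucketing can be done in a single pass with a map keyed by the last lemma. The sketch below is a simplified illustration (it keeps every noun-headed phrase rather than reproducing the size-based entry selection of the original), and the class HeadNounGrouper is hypothetical:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import opennlp.tools.textsimilarity.ParseTreeChunk;

// Hypothetical single-pass variant: bucket phrases by their last lemma (the head noun).
public class HeadNounGrouper {
  public static Map<String, List<ParseTreeChunk>> groupByHeadNoun(List<ParseTreeChunk> phrases) {
    Map<String, List<ParseTreeChunk>> groups = new HashMap<String, List<ParseTreeChunk>>();
    for (ParseTreeChunk ch : phrases) {
      List<String> lemmas = ch.getLemmas();
      int last = lemmas.size() - 1;
      // same acceptance test as doLingGrouping(): the last token must be a noun
      if (last < 0 || !ch.getPOSs().get(last).startsWith("NN"))
        continue;
      String head = lemmas.get(last);
      List<ParseTreeChunk> group = groups.get(head);
      if (group == null) {
        group = new ArrayList<ParseTreeChunk>();
        groups.put(head, group);
      }
      group.add(ch);
    }
    return groups;
  }
}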

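cleanUpStandardizedTopics() above treats two topics as duplicates when their stemmed token sets coincide. That test can be isolated in a small helper, sketched below with the PStemmer and TextProcessor utilities this patch already uses; the class TopicDuplicateCheck is hypothetical. For example, isDuplicate("kitchen floors", "kitchen floor") would return true under Porter-style stemming.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.textsimilarity.TextProcessor;

// Hypothetical helper isolating the duplicate test used by cleanUpStandardizedTopics():
// two topics are duplicates when their stemmed token sets coincide.
public class TopicDuplicateCheck {
  private final PStemmer stemmer = new PStemmer();

  public boolean isDuplicate(String topic1, String topic2) {
    return stemSet(topic1).equals(stemSet(topic2));
  }

  private Set<String> stemSet(String topic) {
    Set<String> stems = new HashSet<String>();
    for (String token : TextProcessor.fastTokenize(topic, false)) {
      stems.add(stemmer.stem(token));
    }
    return stems;
  }
}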