/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parse_thicket.external_rst;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import edu.arizona.sista.discourse.rstparser.DiscourseTree;
import edu.arizona.sista.processors.CorefMention;
import edu.arizona.sista.processors.Document;
import edu.arizona.sista.processors.Processor;
import edu.arizona.sista.processors.Sentence;
import edu.arizona.sista.processors.corenlp.CoreNLPProcessor;
import edu.arizona.sista.struct.DirectedGraphEdgeIterator;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.ArcType;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
import scala.Option;

/**
 * Builds a {@link ParseThicketWithDiscourseTree} from raw text: per-sentence
 * parse trees plus inter-sentence arcs for coreference, communicative actions
 * and RST discourse relations, using the SISTA/CoreNLP processors.
 */
public class ParseCorefBuilderWithNERandRST {

    /** At most this many mentions per coreference chain are turned into arcs. */
    private static final int MAX_MENTIONS_PER_CHAIN = 4;

    private static final Logger log = Logger
            .getLogger("opennlp.tools.parse_thicket.external_rst.ParseCorefBuilderWithNERandRST");

    public Processor proc;
    CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
    AbstractSequenceClassifier<CoreLabel> classifier;

    ParseCorefBuilderWithNERandRST() {
        classifier = CRFClassifier.getDefaultClassifier();
        // NOTE(review): flags presumably enable discourse/coref annotation and
        // 100 a size limit — confirm against CoreNLPProcessor's constructor docs.
        proc = new CoreNLPProcessor(true, true, 100);
    }

    /**
     * Annotates {@code text} and assembles the parse thicket.
     *
     * @param text raw input text
     * @return the thicket with discourse tree, or {@code null} if annotation failed
     */
    public ParseThicketWithDiscourseTree buildParseThicket(String text) {
        List<Tree> ptTrees = new ArrayList<Tree>();
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();

        Document doc = null;
        try {
            doc = proc.annotate(text, false);
        } catch (IllegalArgumentException iae) {
            log.severe("failed to parse text: " + text);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // failed to parse - skip this text
        if (doc == null)
            return null;

        for (Sentence sentence : doc.sentences()) {
            List<ParseTreeNode> sentenceNodes = new ArrayList<ParseTreeNode>();
            String[] tokens = sentence.words();
            for (int i = 0; i < tokens.length; i++) {
                ParseTreeNode p = new ParseTreeNode(tokens[i], sentence.tags().get()[i]);
                p.setId(i + 1); // node ids are 1-based
                if (sentence.entities().isDefined()) {
                    p.setNe(sentence.entities().get()[i]);
                }
                if (sentence.norms().isDefined()) {
                    // the lemma is used as the normalized form (norms() was tried
                    // and commented out in earlier revisions)
                    p.setNormalizedWord(sentence.lemmas().get()[i]);
                }
                sentenceNodes.add(p);
            }

            if (sentence.dependencies().isDefined()) {
                // NOTE(review): dependency edges are written onto nodes in edge
                // iteration order, not by token index — kept as in the original;
                // confirm this mapping is intentional.
                int i = 0;
                DirectedGraphEdgeIterator<String> iterator =
                        new DirectedGraphEdgeIterator<String>(sentence.dependencies().get());
                while (iterator.hasNext()) {
                    scala.Tuple3<Object, Object, String> dep = iterator.next();
                    if (i > sentenceNodes.size() - 1)
                        break;
                    ParseTreeNode p = sentenceNodes.get(i);
                    p.setHead(dep._1().toString());
                    p.setModifier(dep._2().toString());
                    p.setLabel(dep._3());
                    sentenceNodes.set(i, p);
                    i++;
                }
            }
            if (sentence.syntacticTree().isDefined()) {
                ptTrees.add(Tree.valueOf(sentence.syntacticTree().get().toString()));
            }
            nodesThicket.add(sentenceNodes);
        }

        if (doc.coreferenceChains().isDefined()) {
            // these are scala.collection Iterator and Iterable (not Java!)
            scala.collection.Iterator<scala.collection.Iterable<CorefMention>> chains =
                    doc.coreferenceChains().get().getChains().iterator();
            while (chains.hasNext()) {
                scala.collection.Iterator<CorefMention> chain = chains.next().iterator();
                int numInChain = 0; // number of mention slots actually written
                int[] niSentence = new int[MAX_MENTIONS_PER_CHAIN];
                int[] niWord = new int[MAX_MENTIONS_PER_CHAIN];
                int[] startOffset = new int[MAX_MENTIONS_PER_CHAIN];

                while (chain.hasNext() && numInChain < MAX_MENTIONS_PER_CHAIN) {
                    CorefMention mention = chain.next();
                    // all these offsets start at 0
                    niSentence[numInChain] = mention.sentenceIndex();
                    niWord[numInChain] = mention.headIndex();
                    startOffset[numInChain] = mention.startOffset();
                    numInChain++;
                }
                // Link consecutive mentions. Each arc reads slots i and i+1, so the
                // loop stops at numInChain-1 (the original read one slot past the
                // written range for chains shorter than the buffer).
                for (int i = 0; i + 1 < numInChain; i++) {
                    ArcType arcType = new ArcType("coref-", "", 0, 0);
                    WordWordInterSentenceRelationArc arc =
                            new WordWordInterSentenceRelationArc(
                                    new Pair<Integer, Integer>(niSentence[i], niWord[i]),
                                    new Pair<Integer, Integer>(niSentence[i + 1], niWord[i + 1]),
                                    startOffset[i] + "", startOffset[i + 1] + "",
                                    arcType);
                    arcs.add(arc);
                }
            }
        }

        List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
        arcs.addAll(arcsCA);
        ParseThicketWithDiscourseTree result = new ParseThicketWithDiscourseTree(ptTrees, arcs);

        if (doc.discourseTree().isDefined()) {
            Option<DiscourseTree> discourseTree = doc.discourseTree();
            scala.collection.Iterator<DiscourseTree> iterator = discourseTree.iterator();
            while (iterator.hasNext()) {
                DiscourseTree dt = iterator.next();
                result.setDt(dt); // setDt retains only the first (top-level) tree
                List<WordWordInterSentenceRelationArc> rstArcs =
                        new ArrayList<WordWordInterSentenceRelationArc>();
                navigateDiscourseTree(dt, rstArcs, nodesThicket);
                arcs.addAll(rstArcs);
            }
        }

        result.setOrigText(text);
        result.setNodesThicket(nodesThicket);
        result.setDtDump(); // sets the DT representation for TK learning
        return result;
    }

    /**
     * Builds communicative-action (CA) arcs between every pair of sentences
     * that both contain a CA; the arc label is the generalization of the two.
     */
    public List<WordWordInterSentenceRelationArc> buildCAarcs(
            List<List<ParseTreeNode>> nodesThicket) {
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();

        for (int sentI = 0; sentI < nodesThicket.size(); sentI++) {
            for (int sentJ = sentI + 1; sentJ < nodesThicket.size(); sentJ++) {
                List<ParseTreeNode> sentenceI = nodesThicket.get(sentI);
                List<ParseTreeNode> sentenceJ = nodesThicket.get(sentJ);
                Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
                Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
                if (caI == null || caJ == null)
                    continue; // no CA in at least one sentence of the pair
                int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
                int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
                Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);

                ArcType arcType = new ArcType("ca",
                        caGen.getFirst() + printNumArray(caGen.getSecond()), 0, 0);
                arcs.add(new WordWordInterSentenceRelationArc(
                        new Pair<Integer, Integer>(sentI, indexCA1),
                        new Pair<Integer, Integer>(sentJ, indexCA2),
                        caI.getFirst(), caJ.getFirst(), arcType));
            }
        }
        return arcs;
    }

    /** Renders an Integer array as space-separated numbers (trailing space kept). */
    private String printNumArray(Integer[] arr) {
        StringBuilder buf = new StringBuilder();
        for (Integer i : arr) {
            buf.append(i).append(' ');
        }
        return buf.toString();
    }

    /**
     * Recursively creates arcs from the discourse tree {@code dt}, using
     * {@code nodesThicket} to resolve the words being connected. Terminal
     * nodes carry no relation of their own and contribute nothing.
     */
    private void navigateDiscourseTree(DiscourseTree dt,
            List<WordWordInterSentenceRelationArc> arcs,
            List<List<ParseTreeNode>> nodesThicket) {
        if (dt.isTerminal()) {
            return;
        }
        ArcType arcType = new ArcType("rst",
                dt.relationLabel() + "=>" + dt.kind(),
                Boolean.compare(dt.relationDirection().equals("LeftToRight"), true), 0);
        String lemmaFrom = nodesThicket.get(dt.firstSentence())
                .get(dt.firstToken().copy$default$2()).getWord();
        String lemmaTo = nodesThicket.get(dt.lastSentence())
                .get(dt.lastToken().copy$default$2() - 1).getWord();

        arcs.add(new WordWordInterSentenceRelationArc(
                new Pair<Integer, Integer>(dt.firstToken().copy$default$1(), dt.firstToken().copy$default$2()),
                new Pair<Integer, Integer>(dt.lastToken().copy$default$1(), dt.lastToken().copy$default$2()),
                lemmaFrom, lemmaTo, arcType));

        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                navigateDiscourseTree(kid, arcs, nodesThicket);
            }
        }
    }

    public static void main(String[] args) {
        ParseCorefBuilderWithNERandRST builder = new ParseCorefBuilderWithNERandRST();
        String text = "I thought I d tell you a little about what I like to write. And I like to immerse myself in my topics. I just like to dive right in and become sort of a human guinea pig. And I see my life as a series of experiments. So , I work for Esquire magazine , and a couple of years ago I wrote an article called My Outsourced Life , where I hired a team of people in Bangalore , India , to live my life for me. "
                + "So they answered my emails. They answered my phone. ";

        ParseThicket pt = builder.buildParseThicket(text);
        pt = builder.buildParseThicket(
                "Dutch accident investigators say that evidence points to pro-Russian rebels as being responsible for shooting down plane. The report indicates where the missile was fired from and identifies who was in control of the territory and pins the downing of the plane on the pro-Russian rebels. "
                + "However, the Investigative Committee of the Russian Federation believes that the plane was hit by a missile from the air which was not produced in Russia. "
                + "At the same time, rebels deny that they controlled the territory from which the missile was supposedly fired.");
    }
}
package opennlp.tools.parse_thicket.external_rst;

import java.util.List;

import edu.arizona.sista.discourse.rstparser.DiscourseTree;
import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree;

/*
 * Subclass of ParseThicket with the focus on the Discourse Tree (DT).
 * It produces string representations of the discourse tree for tree-kernel learning.
 */
public class ParseThicketWithDiscourseTree extends ParseThicket {

    // A prefix match of this many tokens (plus one) is accepted as an alignment
    // between an EDU text and a window of a sentence.
    private static final int ALIGNMENT_PREFIX_TOKENS = 3;

    // EDUs longer than this many tokens get the cheaper lemma-POS substitution
    // instead of the VerbNet-enriched one.
    private static final int MAX_EDU_TOKENS_FOR_VERBNET = 100;

    private DiscourseTree dt;
    private String dtDump;                  // plain DT dump
    private String dtDumpWithPOS;           // DT with lemma-POS leaves
    private String dtDumpWithEmbeddedTrees; // DT with syntactic subtrees as leaves
    private String dtDumpWithVerbNet;       // DT with VerbNet-enriched verb leaves

    private final TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree();
    private final VerbNetProcessor verbBuilder = VerbNetProcessor.getInstance(null);

    public ParseThicketWithDiscourseTree(List<Tree> ptTrees,
            List<WordWordInterSentenceRelationArc> barcs) {
        super(ptTrees, barcs);
    }

    public DiscourseTree getDt() {
        return dt;
    }

    /** Sets the highest-level DT; under further iterations does not set anything. */
    public void setDt(DiscourseTree dt) {
        if (this.dt == null)
            this.dt = dt;
    }

    /** Computes all four DT string representations from the current tree. */
    public void setDtDump() {
        dtDumpWithPOS = toStringBuilderDTWithPOSSeq(new StringBuilder(100000), this.dt).toString();
        dtDump = toStringBuilderDT(new StringBuilder(100000), this.dt).toString();
        dtDumpWithEmbeddedTrees =
                toStringBuilderDTWithEmbeddedTrees(new StringBuilder(100000), this.dt).toString();
        dtDumpWithVerbNet = toStringBuilderDTWithVerbNet(new StringBuilder(100000), this.dt).toString();
    }

    // basic representation of the discourse tree
    private StringBuilder toStringBuilderDT(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null) {
                sb.append(dt.relationLabel());
                // different StringBuilder type for trees coming from scala
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "(");
                text = text.substring(0, text.length() - 1) + ")";
                sb.append(text);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDT(sb, kid);
            }
        }
        return sb.append(')');
    }

    // DT representation whose leaves are lemma-POS sequences
    private StringBuilder toStringBuilderDTWithPOSSeq(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                String textDump = substituteTextWithPOStext(text,
                        this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                if (textDump != null) // alignment may fail; original appended the string "null"
                    sb.append(textDump);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithPOSSeq(sb, kid);
            }
        }
        return sb.append(')');
    }

    /**
     * Finds the window of {@code list} whose words align with the first tokens
     * of {@code text} and renders it as a lemma-POS sequence.
     *
     * @return the substituted representation, or {@code null} if no alignment found
     */
    private String substituteTextWithPOStext(String text, List<ParseTreeNode> list) {
        String[] tokens = text.split(" ");
        // fixed: bound the window so subList() cannot run past the end of the list
        // (the original iterated offsets up to list.size()-1 and threw
        // IndexOutOfBoundsException on long EDUs)
        for (int offset = 0; offset + tokens.length <= list.size(); offset++) {
            List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
            if (alignsWithTokens(subList, tokens)) {
                return ParseTreeNode.toTreeRepresentationString(subList);
            }
        }
        return null;
    }

    // True when the first ALIGNMENT_PREFIX_TOKENS+1 words (or all, if fewer) match.
    private boolean alignsWithTokens(List<ParseTreeNode> subList, String[] tokens) {
        int count = 0;
        for (ParseTreeNode n : subList) {
            if (!n.getWord().equals(tokens[count]))
                return false;
            count++;
            if (count > ALIGNMENT_PREFIX_TOKENS) // a few tokens are enough for alignment
                break;
        }
        return true;
    }

    // DT representation whose leaves are syntactic subtrees of the sentences
    private StringBuilder toStringBuilderDTWithEmbeddedTrees(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                substituteTextWithParseTree(sb, text,
                        this.getSentenceTrees().get(dt.firstToken().copy$default$1()));
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithEmbeddedTrees(sb, kid);
            }
        }
        return sb.append(')');
    }

    // Appends to sb the subtree of sentenceTree rooted at the node covering the
    // first word(s) of text; appends nothing when no such subtree is found.
    private void substituteTextWithParseTree(StringBuilder sb, String text, Tree sentenceTree) {
        String[] tokens = text.split(" ");
        List<Tree> foundTrees;
        if (tokens.length > 1) {
            foundTrees = extender.getASubtreeWithRootAsNodeForWord1(
                    sentenceTree, sentenceTree, new String[] { tokens[0], tokens[1] });
        } else {
            foundTrees = extender.getASubtreeWithRootAsNodeForWord1(
                    sentenceTree, sentenceTree, new String[] { tokens[0] });
        }

        if (foundTrees == null || foundTrees.size() < 1)
            return;

        extender.toStringBuilder(sb, foundTrees.get(0));
    }

    // DT representation with VerbNet-enriched verbs at the leaves
    private StringBuilder toStringBuilderDTWithVerbNet(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                String textDump;
                if (text.split(" ").length < MAX_EDU_TOKENS_FOR_VERBNET) {
                    // if not TOO long, a more informative substitution including VerbNet
                    textDump = substituteTextWithPOStextVerbNet(text,
                            this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                } else {
                    // otherwise just lemma-POS chains
                    textDump = substituteTextWithPOStext(text,
                            this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                }
                if (textDump != null) // alignment may fail; original appended the string "null"
                    sb.append(textDump);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithVerbNet(sb, kid);
            }
        }
        return sb.append(')');
    }

    /**
     * Substitutes lemma-POS pairs instead of plain lemmas; verbs get a more
     * detailed VerbNet-based representation when one is available.
     *
     * @return the substituted representation, or {@code null} if no alignment found
     */
    private String substituteTextWithPOStextVerbNet(String text, List<ParseTreeNode> list) {
        String[] tokens = text.split(" ");
        // fixed: same subList bounds fix as substituteTextWithPOStext
        for (int offset = 0; offset + tokens.length <= list.size(); offset++) {
            List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
            if (!alignsWithTokens(subList, tokens))
                continue;
            // alignment found: render the window
            StringBuilder buf = new StringBuilder();
            for (ParseTreeNode ch : subList) {
                try {
                    String pos = ch.getPos();
                    if (pos.startsWith(".") || pos.startsWith(",")
                            || pos.startsWith(";") || pos.startsWith("!"))
                        continue; // skip punctuation tokens
                    if (pos.startsWith("VB") && ch.getNormalizedWord() != null) {
                        // more info for verbs, when VerbNet knows the lemma
                        StringBuilder verbRepr = verbBuilder
                                .buildTreeRepresentationForTreeKernelLearning(ch.getNormalizedWord());
                        if (verbRepr != null)
                            buf.append(" (").append(verbRepr).append(") ");
                        else
                            buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
                    } else { // other than verb
                        buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            return buf.toString().trim();
        }
        return null;
    }

    public String getDtDump() {
        return this.dtDump;
    }

    public String getDtDumpWithPOS() {
        return this.dtDumpWithPOS;
    }

    public String getDtDumpWithEmbeddedTrees() {
        return this.dtDumpWithEmbeddedTrees;
    }

    public String getDtDumpWithVerbNet() {
        return this.dtDumpWithVerbNet;
    }
}
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.external_rst; - -import org.apache.commons.lang.StringUtils; - -import opennlp.tools.similarity.apps.utils.Pair; - -public class RstNode { - Boolean isNucleus; - Pair<Integer, Integer> span; - Integer leaf; - String rel2par; - String text; - Integer level; - - public Boolean getIsNucleus() { - return isNucleus; - } - public void setIsNucleus(Boolean isNucleus) { - this.isNucleus = isNucleus; - } - public Pair<Integer, Integer> getSpan() { - return span; - } - public void setSpan(Pair<Integer, Integer> span) { - this.span = span; - } - public Integer getLeaf() { - return leaf; - } - public void setLeaf(Integer leaf) { - this.leaf = leaf; - } - public String getRel2par() { - return rel2par; - } - public void setRel2par(String rel2par) { - this.rel2par = rel2par; - } - public String getText() { - return text; - } - public void setText(String text) { - this.text = text; - } - - public String toString() { - String ret = ""; - if (isNucleus!=null && isNucleus) - ret+="Nucleus "; - if (span!=null) - 
ret+="["+span.getFirst()+" "+ span.getSecond()+"]"; - ret += " >> "+ rel2par; - if (text!=null) - ret+= " >> "+text; - return ret; - } - public RstNode(String line) { - if (StringUtils.trim(line).startsWith(")")) - return; - - - level = line.indexOf("("); - line = line.substring(line.indexOf("(")+2); - - isNucleus = line.substring(0, line.indexOf("(")).indexOf("Nucleus")>-1; - line = line.substring(line.indexOf("(")+1); - if (line.startsWith("span")){ - line = line.substring(5); - try { - span = new Pair<Integer, Integer>(); - String[] spanStr = line.substring(0, line.indexOf(")")).split(" "); - span.setFirst(Integer.parseInt(spanStr[0])); - span.setSecond(Integer.parseInt(spanStr[1])); - } catch (Exception e) { - e.printStackTrace(); - } - - } else if (line.startsWith("leaf")){ - try { - String leafStr = line.substring(5, line.indexOf(")")); - leaf = Integer.parseInt(leafStr); - } catch (Exception e) { - e.printStackTrace(); - } - - } else System.err.println("Problem parsing RST results: '"+line); - - line = line.substring(line.indexOf("rel2par")+8); - rel2par = line.substring(0, line.indexOf(")")).trim(); - - text = StringUtils.substringBetween(line, "_!", "_!)"); - - - } - - public static void main(String[] args){ - RstNode n1 = new RstNode(" ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) 
)"), - n2 = new RstNode(" )"), - n3 = new RstNode(" ( Satellite (span 15 16) (rel2par Explanation)"); - - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java deleted file mode 100644 index b41cd46..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
import java.util.ArrayDeque;
import java.util.Deque;

/**
 * Utilities for checking balanced brackets: an incremental checker that can be
 * fed text in portions, plus static one-shot checkers.
 */
public class BracesProcessor {
    /** The incremental result is only positive when more than this many openers were seen. */
    private static final int MIN_BRACES_CNT = 5;

    private static final char L_PAREN = '(';
    private static final char R_PAREN = ')';
    private static final char L_BRACE = '{';
    private static final char R_BRACE = '}';
    private static final char L_BRACKET = '[';
    private static final char R_BRACKET = ']';

    // Deque instead of the legacy java.util.Stack (unsynchronized, faster)
    private final Deque<Character> stackIncremental = new ArrayDeque<Character>();
    private int count = 0;              // total opening brackets seen so far
    private boolean balancedSoFar = true;

    /**
     * @return true iff everything fed so far is balanced, fully closed, and
     *         contained more than {@link #MIN_BRACES_CNT} opening brackets
     */
    public Boolean getBalancedBracesResult() {
        return balancedSoFar && stackIncremental.isEmpty() && count > MIN_BRACES_CNT;
    }

    /**
     * Feeds the next portion of text to the incremental checker.
     * Non-bracket characters are ignored. Once an imbalance is seen the
     * checker latches {@code balancedSoFar = false}.
     */
    public void analyzeBalancedBracesAddPortionIncremental(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case L_PAREN:
                case L_BRACE:
                case L_BRACKET:
                    stackIncremental.push(c);
                    count++;
                    break;
                case R_PAREN:
                    // fixed: the original popped even when the stack was empty,
                    // throwing EmptyStackException; short-circuit avoids the pop
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_PAREN)
                        balancedSoFar = false;
                    break;
                case R_BRACE:
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_BRACE)
                        balancedSoFar = false;
                    break;
                case R_BRACKET:
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_BRACKET)
                        balancedSoFar = false;
                    break;
                default:
                    // ignore all other characters
            }
        }
    }

    /**
     * One-shot check that {@code s} has balanced (), {} and [].
     * Non-bracket characters are ignored.
     */
    public static boolean isBalanced(String s) {
        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == L_PAREN || c == L_BRACE || c == L_BRACKET) {
                stack.push(c);
            } else if (c == R_PAREN) {
                if (stack.isEmpty() || stack.pop().charValue() != L_PAREN) return false;
            } else if (c == R_BRACE) {
                if (stack.isEmpty() || stack.pop().charValue() != L_BRACE) return false;
            } else if (c == R_BRACKET) {
                if (stack.isEmpty() || stack.pop().charValue() != L_BRACKET) return false;
            }
            // ignore all other characters
        }
        return stack.isEmpty();
    }

    /**
     * Checks that (), {} and [] are properly nested and matched.
     * Fixed: the original's closing-bracket condition was mis-parenthesized
     * ({@code current == '}' && (last == '{' || current == ')') && last == '(' || ...}),
     * accepting mismatched pairs and rejecting valid ones.
     */
    public static boolean checkParentesis(String str) {
        if (str.isEmpty())
            return true;

        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            if (current == '{' || current == '(' || current == '[') {
                stack.push(current);
            } else if (current == '}' || current == ')' || current == ']') {
                if (stack.isEmpty())
                    return false;
                char last = stack.pop();
                boolean matches = (current == '}' && last == '{')
                        || (current == ')' && last == '(')
                        || (current == ']' && last == '[');
                if (!matches)
                    return false;
            }
        }
        return stack.isEmpty();
    }

    /**
     * Checks that () and {} are properly nested and matched.
     * Fixed: the original returned false on any '{' and the brace-matching
     * branch below that early return was unreachable dead code.
     */
    public static boolean isParenthesisMatch(String str) {
        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == '(' || c == '{') {
                stack.push(c);
            } else if (c == ')') {
                if (stack.isEmpty() || stack.pop().charValue() != '(')
                    return false;
            } else if (c == '}') {
                if (stack.isEmpty() || stack.pop().charValue() != '{')
                    return false;
            }
        }
        return stack.isEmpty();
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parse_thicket.kernel_interface;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

/**
 * Extracts "descriptive" paragraphs from a document (any format Tika can
 * parse). A paragraph is considered descriptive when it is long enough,
 * word-rich enough, and its average sentence length clears a threshold.
 */
public class DescriptiveParagraphFromDocExtractor {
    protected static Tika tika = new Tika();

    // tuning constants, in characters unless noted (made final: they were
    // mutable static ints before)
    private static final int MIN_PARA_LENGTH = 200;
    private static final int MIN_NUM_WORDS = 15;          // in words
    private static final int MAX_PARA_LENGTH = 500;
    private static final int TEXT_PORTION_FOR_ANALYSIS = 20000;
    private static final int MAX_PARA_OUTPUT = 20;        // max paragraphs returned

    /**
     * Returns the first descriptive paragraph of {@code f}, or (as a fallback)
     * the first 150 characters of its text when no paragraph qualifies.
     */
    public static String getFirstParagraphFromFile(File f) {
        String text = "";
        try {
            try {
                text = tika.parseToString(f);
            } catch (TikaException e) {
                e.printStackTrace();
            }
            if (text.length() > TEXT_PORTION_FOR_ANALYSIS)
                text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
            float avgSentSizeThr = (float) MIN_PARA_LENGTH / 4f;
            String[] portions = text.split("\\.\\n");
            for (String p : portions) {
                float avgSentSize = (float) p.length() / (float) p.split("\\n\\n").length;
                if (p.length() > MIN_PARA_LENGTH && p.split(" ").length > MIN_NUM_WORDS
                        && avgSentSize > avgSentSizeThr && p.length() < MAX_PARA_LENGTH) {
                    return normalizePara(p);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // no suitable paragraph found: return the head of the whole text
        if (text.length() > 150)
            text = text.substring(0, 150);
        return text;
    }

    /**
     * Returns up to {@link #MAX_PARA_OUTPUT} descriptive paragraphs from
     * {@code f}; falls back to a truncated head of the text when none qualify.
     */
    public static List<String> getLongParagraphsFromFile(File f) {
        List<String> results = new ArrayList<String>();
        String text = "";
        try {
            try {
                text = tika.parseToString(f);
            } catch (TikaException e) {
                e.printStackTrace();
            }
            if (text.length() > TEXT_PORTION_FOR_ANALYSIS)
                text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
            float avgSentSizeThr = (float) MIN_PARA_LENGTH / 4f;
            // try progressively looser paragraph delimiters
            String[] portions = text.split("\\.\\n");
            if (portions.length < 2)
                portions = text.split("\\n\\n");
            if (portions.length < 2)
                portions = text.split("\\n \\n");
            if (portions.length < 2)
                portions = splitBySentenceBudget(text);
            for (String p : portions) {
                try {
                    float avgSentSize = (float) p.length() / (float) p.split("\\n\\n").length;
                    if (p.length() > MIN_PARA_LENGTH && p.split(" ").length > MIN_NUM_WORDS
                            && avgSentSize > avgSentSizeThr) {
                        if (p.length() < MAX_PARA_LENGTH) {
                            results.add(normalizePara(p));
                        } else { // reduce length to the last '.' in the substring
                            String pReduced = p;
                            if (p.length() >= MAX_PARA_LENGTH + 80)
                                pReduced = p.substring(0, MAX_PARA_LENGTH + 80);
                            int indexPeriod = pReduced.lastIndexOf('.');
                            if (indexPeriod > -1)
                                pReduced = pReduced.substring(0, indexPeriod);
                            results.add(normalizePara(pReduced));
                        }
                        // fixed off-by-one: the original allowed MAX_PARA_OUTPUT+1 results
                        if (results.size() >= MAX_PARA_OUTPUT)
                            break;
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (results.size() < 1) {
                if (text.length() >= MAX_PARA_LENGTH + 80)
                    text = text.substring(0, MAX_PARA_LENGTH + 80);
                results.add(text);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (results.size() < 1) {
            System.err.println("Failed to extract text from " + f.getName());
        }
        return results;
    }

    /**
     * Last-resort splitter: greedily packs sentences into chunks of roughly
     * MAX_PARA_LENGTH characters. Fixed vs. the original inline version, which
     * dropped the sentence that overflowed a chunk and lost the final partial
     * chunk entirely.
     */
    private static String[] splitBySentenceBudget(String text) {
        // NOTE(review): the '.'->'&' / " & " split is kept from the original;
        // it only separates sentences written as "x . y" — confirm intent.
        String[] sentences = text.replace('.', '&').split(" & ");
        List<String> portionsLst = new ArrayList<String>();
        int totalChars = 0;
        StringBuilder buffer = new StringBuilder();
        for (String sent : sentences) {
            buffer.append(sent).append(". ");
            totalChars += sent.length();
            if (totalChars > MAX_PARA_LENGTH) {
                portionsLst.add(buffer.toString());
                buffer.setLength(0);
                totalChars = 0;
            }
        }
        if (buffer.length() > 0)
            portionsLst.add(buffer.toString());
        return portionsLst.toArray(new String[0]);
    }

    // Flattens newlines, strips ellipses/double spaces and any character
    // outside a small ASCII whitelist.
    private static String normalizePara(String p) {
        p = p.replaceAll("\\n", " ").replaceAll("\\.\\.", " ").replaceAll("  ", " ");
        p = p.replaceAll("[^A-Za-z0-9 _\\.,\\!]", "");
        return p;
    }

    public static void main(String[] args) {
        // NOTE(review): developer-machine demo paths kept from the original;
        // this main only works where those files exist.
        List<String> results = getLongParagraphsFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/design_doc_posNeg/pos/2IP40 Detail Design Document.pdf"));
        System.out.println(results);

        String res = getFirstParagraphFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
                        + "design_doc/2004Schalk_BCI2000Implementation.pdf"));
        System.out.println(res);
        results = getLongParagraphsFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
                        + "design_doc/2004Schalk_BCI2000Implementation.pdf"));
        System.out.println(results);
    }
}
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.util.StringUtils; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.apps.WebPageContentSentenceExtractor; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceExtendedForestSearchResultsProcessorSetFormer extends MultiSentenceKernelBasedSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor"); - protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree(); - - private TreeKernelRunner tkRunner = new 
TreeKernelRunner(); - - protected static final String modelFileName = "model.txt"; - - private static final String trainingFileName = "training.txt"; - - protected static final String unknownToBeClassified = "unknown.txt"; - - private static final String classifierOutput = "classifier_output.txt"; - - private String path; - public void setKernelPath (String path){ - this.path=path; - } - - WebPageContentSentenceExtractor extractor = new WebPageContentSentenceExtractor(); - - private List<HitBase> formTreeForestDataSet( - List<HitBase> hits, String query, boolean isPositive) { - List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documents. Since search results are ranked, we set the first half as positive set, - //and the second half as negative set. - // after re-classification, being re-ranked, the search results might end up in a different set - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - int count = 0; - for (HitBase hit : hits) { - count++; - // if orig content has been already set in HIT object, ok; otherwise set it - String searchResultText = hit.getPageContent(); - if (searchResultText ==null){ - try { - HitBase hitWithFullSents = extractor.formTextFromOriginalPageGivenSnippet(hit); - for(String paragraph: hitWithFullSents.getOriginalSentences()){ - List<String[]> res = formTreeKernelStructure(paragraph, count, hits, isPositive); - for(String[] rl : res){ - StringUtils.printToFile(new File(path+trainingFileName), rl[0]+" \n", true); - } - //treeBankBuffer.addAll(res); - } - } catch (Exception e) { - e.printStackTrace(); - } - - } - newHitList.add(hit); - - - } - // write the lits of samples to a file - ProfileReaderWriter.appendReport(treeBankBuffer, path+trainingFileName, ' '); - return newHitList; - - } - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits, boolean isPositive) { - List<String[]> 
treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<Tree> forest = pt.getSentences(); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (isPositive) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the list of training samples - for(Tree t: forest){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); - } - } catch (Exception e) { - e.printStackTrace(); - } - return treeBankBuffer; - } - - public List<HitBase> runSearchViaAPI(String query, Boolean isPositive) { - - try { - List<HitBase> hits = bingSearcher.runSearch(query, 20, true); - formTreeForestDataSet(hits, query, isPositive); - - } catch (Exception e) { - e.printStackTrace(); - LOG.info("No search results for query '" + query); - return null; - } - - - return null; - } - public static void main(String[] args){ - String query = "digital camera for my mother as a gift"; - Boolean isPositive = true; - if (args!=null && args.length>0){ - query = args[0]; - if (args.length>1 && args[1]!=null && args[1].startsWith("neg")) - isPositive = false; - } - - MultiSentenceExtendedForestSearchResultsProcessorSetFormer proc = new MultiSentenceExtendedForestSearchResultsProcessorSetFormer(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel_big\\"); - proc.runSearchViaAPI(query, isPositive); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java deleted file mode 100644 index 1b2790f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceKernelBasedExtendedForestSearchResultsProcessor extends MultiSentenceKernelBasedSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor"); - protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree(); - - - - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) { - List<String[]> treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (count<hits.size()/2) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the 
list of training samples - for(String t: extendedTreesDump){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t+ " |ET|"}); - } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return treeBankBuffer; - } - - public static void main(String[] args){ - String query = null; - - /*" I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " + - "standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " + - "command that was either oblivious to or tolerant of criminal behavior"; - - query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US"; - - query = "ECUADOR'S PRESIDENT RAFAEL CORREA SAYS U.S. VP JOE BIDEN WANTS HIM TO REFUSE WHISTLEBLOWER EDWARD SNOWDEN'S BID FOR ASYLUM"; - query = "how to pay tax on foreign income from real estate"; - */ - if (args!=null && args.length>0) - query = args[0]; - - MultiSentenceKernelBasedExtendedForestSearchResultsProcessor proc = new MultiSentenceKernelBasedExtendedForestSearchResultsProcessor(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\"); - proc.runSearchViaAPI(query); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java deleted file mode 100644 index 39d348e..0000000 --- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.BingQueryRunnerMultipageSearchResults; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import 
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceKernelBasedSearchResultsProcessor extends MultiSentenceSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedSearchResultsProcessor"); - - private WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper(); - protected Matcher matcher = new Matcher(); - private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); - protected BingQueryRunnerMultipageSearchResults bingSearcher = new BingQueryRunnerMultipageSearchResults(); - private SnippetToParagraph snp = new SnippetToParagraph(); - private TreeKernelRunner tkRunner = new TreeKernelRunner(); - - private String path; - public void setKernelPath (String path){ - this.path=path; - } - protected static final String modelFileName = "model.txt"; - - private static final String trainingFileName = "training.txt"; - - protected static final String unknownToBeClassified = "unknown.txt"; - - private static final String classifierOutput = "classifier_output.txt"; - - - public List<HitBase> runSearchViaAPI(String query) { - List<HitBase> hits = null; - try { - List<HitBase> resultList = bingSearcher.runSearch(query); - // now we apply our own relevance filter - //hits = calculateMatchScoreResortHits(resultList, query); - - hits = resultList; - //once we applied our re-ranking, we set highly ranked as positive set, low-rated as negative set - //and classify all these search results again - //training set is formed from original documents for the search results, - // and snippets of these search results are classified - hits = filterOutIrrelevantHitsByTreeKernelLearning(hits, query); - - } catch (Exception e) { - e.printStackTrace(); - LOG.info("No search results for query '" + query); - return null; - } - - - return hits; - } - - private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning( - 
List<HitBase> hits, String query) { - List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documents. Since search results are ranked, we set the first half as positive set, - //and the second half as negative set. - // after re-classification, being re-ranked, the search results might end up in a different set - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - int count = 0; - for (HitBase hit : hits) { - count++; - // if orig content has been already set in HIT object, ok; otherwise set it - String searchResultText = hit.getPageContent(); - if (searchResultText ==null){ - String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit); - searchResultText = pageSentsAndSnippet[0]; - hit.setPageContent(searchResultText); - } - newHitList.add(hit); - treeBankBuffer.addAll(formTreeKernelStructure(searchResultText, count, hits)); - - } - // write the lits of samples to a file - ProfileReaderWriter.writeReport(treeBankBuffer, path+trainingFileName, ' '); - // build the model - tkRunner.runLearner(path, trainingFileName, modelFileName); - - // now we preparing the same answers to be classifies in/out - treeBankBuffer = new ArrayList<String[]>(); - for (HitBase hit : newHitList) { - // not original docs now but instead a snippet - String searchResultTextAbstr = hit.getAbstractText(); - String snippet = searchResultTextAbstr.replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ") - .replace("<b>", "").replace("</b>", ""); - snippet = snippet.replace("</B>", "").replace("<B>", "") - .replace("<br>", "").replace("</br>", "").replace("...", ". ") - .replace("|", " ").replace(">", " ").replace(". .", ". 
"); - snippet = hit.getTitle() + " " + snippet; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(snippet); - //hit.getPageContent()); - List<Tree> forest = pt.getSentences(); - // we consider the snippet as a single sentence to be classified - if (forest.size()>0){ - treeBankBuffer.add(new String[] {"0 |BT| "+forest.get(0).toString()+ " |ET|"}); - newHitListReRanked .add(hit); - } - - } - // form a file from the snippets to be classified - ProfileReaderWriter.writeReport(treeBankBuffer, path+unknownToBeClassified, ' '); - tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput); - // read classification results - List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' '); - // iterate through classification results and set them as scores for hits - newHitList = new ArrayList<HitBase>(); - for(int i=0; i<newHitListReRanked.size() && i<classifResults.size() ; i++){ - String scoreClassif = classifResults.get(i)[0]; - float val = Float.parseFloat(scoreClassif); - HitBase hit = newHitListReRanked.get(i); - hit.setGenerWithQueryScore((double) val); - newHitList.add(hit); - } - - // sort by SVM classification results - Collections.sort(newHitList, new HitBaseComparable()); - System.out.println("\n\n ============= NEW ORDER ================= "); - for (HitBase hit : newHitList) { - System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); - System.out.println("page content = "+hit.getPageContent()); - System.out.println("title = "+hit.getAbstractText()); - System.out.println("snippet = "+hit.getAbstractText()); - System.out.println("match = "+hit.getSource()); - } - - return newHitList; - - } - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) { - List<String[]> treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket 
pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<Tree> forest = pt.getSentences(); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (count<hits.size()/2) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the list of training samples - for(Tree t: forest){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); - } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return treeBankBuffer; - } - - public static void main(String[] args){ - String query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " + - "standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " + - "command that was either oblivious to or tolerant of criminal behavior"; - - query = "I am now living abroad and have health insurance from Russia. 
How can I avoid penalty for not having health insurance in US"; - - MultiSentenceKernelBasedSearchResultsProcessor proc = new MultiSentenceKernelBasedSearchResultsProcessor(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\"); - proc.runSearchViaAPI(query); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java deleted file mode 100644 index fb5eed8..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.List; - -import edu.stanford.nlp.trees.Tree; - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.textsimilarity.ParseTreeChunk; - -public class PT2ExtendedTreeForestBuilder { - private Matcher matcher = new Matcher(); - private TreeKernelRunner tkRunner = new TreeKernelRunner(); - private static final String modelFileName = "model.txt", - trainingFileName = "training.txt"; - - private List<String[]> formTrainingSetFromText(String para, boolean positive){ - String prefix = null; - if (positive) - prefix=" 1 "; - else - prefix=" -1 "; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para); - List<Tree> forest = pt.getSentences(); - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - for(Tree t: forest){ - treeBankBuffer.add(new String[] {prefix+"|BT| "+t.toString()+ " |ET|"}); - } - return treeBankBuffer; - } - - private String formTrainingSetFromTextOneLine(String para, boolean positive){ - String prefix = null; - if (positive) - prefix=" 1 "; - else - prefix=" -1 "; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para); - List<Tree> forest = pt.getSentences(); - String line = prefix; - for(Tree t: forest){ - line+= "|BT| "+t.toString()+ " |ET| "; - } - return line; - } - - public void formPosNegTrainingSet(String pos, String neg, String path){ - List<String[]> list = formTrainingSetFromText(pos, true), - negList= formTrainingSetFromText(neg, false); - list.addAll(negList); - ProfileReaderWriter.writeReport(list, path+trainingFileName, ' '); - tkRunner.runLearner(path, trainingFileName, modelFileName); - } - - public void classifySentences(String sentences, String path){ - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(sentences); - List<Tree> forest = pt.getSentences(); - 
List<String[]> treeBankBuffer = new ArrayList<String[]>(); - for(Tree t: forest){ - treeBankBuffer.add(new String[] {" 0 |BT| "+t.toString()+ " |ET|"}); - } - - ProfileReaderWriter.writeReport(treeBankBuffer, path+"unknown.txt", ' '); - tkRunner.runClassifier(path, "unknown.txt", modelFileName, "classifier_output.txt"); - } - - - public static void main(String[] args){ - - PT2ExtendedTreeForestBuilder builder = new PT2ExtendedTreeForestBuilder(); - - - String posSents = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+ - "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + - "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + - "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "; - - String negSents = "Iran refuses the UN offer to end a conflict over its nuclear weapons."+ - "UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " + - "A recent UN report presented charts saying Iran was working on nuclear weapons. " + - "Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. "; - builder.formPosNegTrainingSet(posSents, negSents, "C:\\stanford-corenlp\\tree_kernel\\"); - - - builder.classifySentences("Iran refuses Iraq's offer to end its conflict with UN. Iran passes a resolution prohibiting UN from doing second" + - " uranium enrichment site. Envoy to US says its nuclear development is for peaceful purposes. 
Material evidence againt US has been fabricated by UN.", - - "C:\\stanford-corenlp\\tree_kernel\\"); - } -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java deleted file mode 100644 index d6a295f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.logging.Logger; - -import org.apache.commons.lang.StringUtils; - - -import opennlp.tools.parse_thicket.apps.MinedSentenceProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.similarity.apps.Fragment; -import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.RelatedSentenceFinder; -import opennlp.tools.similarity.apps.utils.PageFetcher; -import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; -import opennlp.tools.similarity.apps.utils.Utils; -import opennlp.tools.textsimilarity.TextProcessor; - - -public class SnippetToParagraphFull extends SnippetToParagraph { - private PageFetcher pFetcher = new PageFetcher(); - private static Logger LOG = Logger - .getLogger("com.become.parse_thicket.apps.SnippetToParagraphFull"); - - - - public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) { - - String[] sents = extractSentencesFromPage(item.getUrl()); - - String title = item.getTitle().replace("<b>", " ").replace("</b>", " ") - .replace(" ", " ").replace(" ", " "); - // generation results for this sentence - List<String> result = new ArrayList<String>(); - // form plain text from snippet - String snapshot = item.getAbstractText().replace("<b>", " ") - .replace("</b>", " ").replace(" ", " ").replace(" ", " ").replace("\"", ""); - - String snapshotMarked = snapshot.replace(" ...", "."); - List<String> fragments = TextProcessor.splitToSentences(snapshotMarked); - if (fragments.size()<3 && StringUtils.countMatches(snapshotMarked, ".")>1){ - snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&"); - String[] fragmSents = snapshotMarked.split("&"); - fragments = Arrays.asList(fragmSents); - } - - 
for (String f : fragments) { - String followSent = null; - if (f.length() < 50) - continue; - String pageSentence = ""; - // try to find original sentence from webpage - - try { - String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( - f, sents); - pageSentence = mainAndFollowSent[0]; - followSent = mainAndFollowSent[1]; - if (pageSentence!=null) - result.add(pageSentence); - else { - result.add(f); - LOG.info("Could not find the original sentence \n"+f +"\n in the page " ); - } - //if (followSent !=null) - // result.add(followSent); - } catch (Exception e) { - - e.printStackTrace(); - } - } - item.setOriginalSentences(result); - return item; - } - - -} - http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java deleted file mode 100644 index c980f9f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.List; -import java.util.logging.Logger; - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.ParseTreeNode; -import opennlp.tools.parse_thicket.VerbNetProcessor; -import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; -import edu.stanford.nlp.trees.Tree; - -public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { - private static Logger log = Logger - .getLogger("opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree"); - - public List<String> buildForestForCorefArcs(ParseThicket pt){ - List<String> results = new ArrayList<String>(); - for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ - //if (!arc.getArcType().getType().startsWith("coref")) - // continue; - int fromSent = arc.getCodeFrom().getFirst(); - int toSent = arc.getCodeTo().getFirst(); - if (fromSent <1 || toSent <1 ) // TODO problem in sentence enumeration => skip building extended trees - return results; - - String wordFrom = arc.getLemmaFrom(); - String wordTo = arc.getLemmaTo(); - - List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), - pt.getSentences().get(fromSent-1), new String[]{ wordFrom}); - if (trees==null || trees.size()<1) - continue; - System.out.println(trees); 
- StringBuilder sb = new StringBuilder(10000); - toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent-1), trees.get(0), new String[]{wordTo}); - System.out.println(sb.toString()); - results.add(sb.toString()); - } - // if no arcs then orig sentences - if (results.isEmpty()){ - for(Tree t: pt.getSentences()){ - results.add(t.toString()); - } - } - return results; - } - // sentences in pt are enumerarted starting from 0; - //this func works with Sista version of Stanford NLP and sentences are coded from 0 - public List<String> buildForestForRSTArcs(ParseThicket pt){ - List<String> results = new ArrayList<String>(); - for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ - // TODO - uncomment - //if (!arc.getArcType().getType().startsWith("rst")) - // continue; - int fromSent = arc.getCodeFrom().getFirst(); - int toSent = arc.getCodeTo().getFirst(); - - String wordFrom = arc.getLemmaFrom(); - String wordTo = arc.getLemmaTo(); - - if (wordFrom == null || wordFrom.length()<1 || wordTo == null || wordTo.length()<1) - log.severe("Empty lemmas for RST arc "+ arc); - - List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent), - pt.getSentences().get(fromSent), new String[]{ wordFrom}); - if (trees==null || trees.size()<1) - continue; - System.out.println(trees); - StringBuilder sb = new StringBuilder(10000); - Tree tree = trees.get(0); - // instead of phrase type for the root of the tree, we want to put the RST relation name - if (arc.getArcType().getType().startsWith("rst")) - tree.setValue(arc.getArcType().getSubtype()); - - toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent), tree, new String[]{wordTo}); - System.out.println(sb.toString()); - results.add(sb.toString()); - } - // if no arcs then orig sentences - if (results.isEmpty()){ - for(Tree t: pt.getSentences()){ - results.add(t.toString()); - } - } - return results; - } - - public StringBuilder 
toStringBuilderExtenderByAnotherLinkedTree1(StringBuilder sb, Tree t, Tree treeToInsert, String[] corefWords) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - boolean bInsertNow=false; - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - if (corefWords!=null){ - String word = corefWords[corefWords.length-1]; - String phraseStr = kid.toString(); - phraseStr=phraseStr.replace(")", ""); - if (phraseStr.endsWith(word)){ - bInsertNow=true; - } - } - } - if (bInsertNow){ - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, null, null); - } - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, treeToInsert, null, null); - } else { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, treeToInsert, corefWords); - } - - } - } - - return sb.append(')'); - } - } - - // given a parse tree and a - public List<Tree> getASubtreeWithRootAsNodeForWord1(Tree tree, Tree currentSubTree, String[] corefWords){ - if (currentSubTree.isLeaf()){ - return null; - } - List<Tree> result = null; - Tree[] kids = currentSubTree.children(); - if (kids != null) { - boolean bFound=false; - String word = corefWords[corefWords.length-1]; - for (Tree kid : kids) { - if (bFound){ - result.add(kid); - } else { - String phraseStr = kid.toString(); - phraseStr=phraseStr.replace(")", ""); - if (phraseStr.endsWith(word)){ // found - bFound=true; - result = new ArrayList<Tree>(); - } - } - } - if (bFound){ - return result; - } - // if not a selected node, proceed with iteration - for (Tree kid : kids) { - List<Tree> ts = getASubtreeWithRootAsNodeForWord1(tree, kid, corefWords); - if (ts!=null) - return ts; - } - - } - return null; - } - - // now obsolete - public Tree[] getASubtreeWithRootAsNodeForWord(Tree 
tree, Tree currentSubTree, String[] corefWords){ - if (currentSubTree.isLeaf()){ - return null; - } - - - boolean bInsertNow=false; - /*List<ParseTreeNode> bigTreeNodes = parsePhrase(currentSubTree.label().value()); - for(ParseTreeNode smallNode: bigTreeNodes ){ - if (bigTreeNodes.get(0).getWord().equals("") ) - continue; - String word = bigTreeNodes.get(0).getWord(); - for(String cWord: corefWords){ - - if (word.equalsIgnoreCase(cWord)) - bInsertNow=true; - } - } */ - - String nodePhraseStr = currentSubTree.toString(); - System.out.println(nodePhraseStr); - for(String w: corefWords) - nodePhraseStr = nodePhraseStr.replace(w, ""); - // all words are covered - if (nodePhraseStr.toUpperCase().equals(nodePhraseStr)) - bInsertNow=true; - - //if(bInsertNow) - // return currentSubTree; - - Tree[] kids = currentSubTree.children(); - if (kids != null) { - /*for (Tree kid : kids) { - List<ParseTreeNode> bigTreeNodes = parsePhrase(kid.label().value()); - if (bigTreeNodes!=null && bigTreeNodes.size()>0 && bigTreeNodes.get(0)!=null && - bigTreeNodes.get(0).getWord().equalsIgnoreCase(corefWords[0])){ - bInsertNow=true; - return kids; - } - - }*/ - - - for (Tree kid : kids) { - Tree[] t = getASubtreeWithRootAsNodeForWord(tree, kid, corefWords); - if (t!=null) - return t; - } - - } - return null; - } - - - public StringBuilder toStringBuilderExtenderByAnotherLinkedTree(StringBuilder sb, Tree t, Tree treeToInsert) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - - boolean bInsertNow=false; - // we try match trees to find out if we are at the insertion position - if (treeToInsert!=null){ - List<ParseTreeNode> bigTreeNodes = parsePhrase(t.label().value()); - List<ParseTreeNode> smallTreeNodes = parsePhrase(treeToInsert.getChild(0).getChild(0).getChild(0).label().value()); - - System.out.println(t + " \n "+ 
treeToInsert+ "\n"); - - if (smallTreeNodes.size()>0 && bigTreeNodes.size()>0) - for(ParseTreeNode smallNode: smallTreeNodes ){ - if (!bigTreeNodes.get(0).getWord().equals("") - && bigTreeNodes.get(0).getWord().equalsIgnoreCase(smallNode.getWord())) - bInsertNow=true; - } - } - - if (bInsertNow){ - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, kid, null); - } - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, treeToInsert.getChild(0).getChild(1), null); - int z=0; z++; - } - } else { - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, kid, treeToInsert); - } - - } - } - return sb.append(')'); - } - } - - public StringBuilder toStringBuilder(StringBuilder sb, Tree t) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilder(sb, kid); - } - } - return sb.append(')'); - } - } - - public static void main(String[] args){ - VerbNetProcessor p = VerbNetProcessor. - getInstance("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources"); - - Matcher matcher = new Matcher(); - TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree(); - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(//"I went to the forest to look for a tree. I found out that it was thick and green"); - "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. "+ - "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. 
" + - "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + - "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "); - - List<String> results = extender.buildForestForCorefArcs(pt); - System.out.println(results); - //System.exit(0); - - List<Tree> forest = pt.getSentences(); - - List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"its"}); - System.out.println(trees); - StringBuilder sb = new StringBuilder(10000); - extender.toStringBuilderExtenderByAnotherLinkedTree1(sb, forest.get(0), trees.get(0), new String[]{"the", "forest"}); - System.out.println(sb.toString()); - - - // - //extender.toStringBuilderExtenderByAnotherLinkedTree(sb, forest.get(0), forest.get(1)); - //System.out.println(sb.toString()); - } -}
