/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parse_thicket.external_rst;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

import edu.arizona.sista.discourse.rstparser.DiscourseTree;
import edu.arizona.sista.processors.CorefMention;
import edu.arizona.sista.processors.Document;
import edu.arizona.sista.processors.Processor;
import edu.arizona.sista.processors.Sentence;
import edu.arizona.sista.processors.corenlp.CoreNLPProcessor;
import edu.arizona.sista.struct.DirectedGraphEdgeIterator;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.ArcType;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
import scala.Option;

/**
 * Builds a {@link ParseThicketWithDiscourseTree} from raw text: per-sentence
 * parse trees plus inter-sentence arcs for coreference, communicative actions
 * and RST discourse relations, using the SISTA/CoreNLP processors.
 */
public class ParseCorefBuilderWithNERandRST {

    /** At most this many mentions per coreference chain are turned into arcs. */
    private static final int MAX_MENTIONS_PER_CHAIN = 4;

    private static final Logger log = Logger
            .getLogger("opennlp.tools.parse_thicket.external_rst.ParseCorefBuilderWithNERandRST");

    public Processor proc;
    CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
    AbstractSequenceClassifier<CoreLabel> classifier;

    ParseCorefBuilderWithNERandRST() {
        classifier = CRFClassifier.getDefaultClassifier();
        // NOTE(review): flags presumably enable discourse/coref annotation and
        // 100 a size limit — confirm against CoreNLPProcessor's constructor docs.
        proc = new CoreNLPProcessor(true, true, 100);
    }

    /**
     * Annotates {@code text} and assembles the parse thicket.
     *
     * @param text raw input text
     * @return the thicket with discourse tree, or {@code null} if annotation failed
     */
    public ParseThicketWithDiscourseTree buildParseThicket(String text) {
        List<Tree> ptTrees = new ArrayList<Tree>();
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
        List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();

        Document doc = null;
        try {
            doc = proc.annotate(text, false);
        } catch (IllegalArgumentException iae) {
            log.severe("failed to parse text: " + text);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // failed to parse - skip this text
        if (doc == null)
            return null;

        for (Sentence sentence : doc.sentences()) {
            List<ParseTreeNode> sentenceNodes = new ArrayList<ParseTreeNode>();
            String[] tokens = sentence.words();
            for (int i = 0; i < tokens.length; i++) {
                ParseTreeNode p = new ParseTreeNode(tokens[i], sentence.tags().get()[i]);
                p.setId(i + 1); // node ids are 1-based
                if (sentence.entities().isDefined()) {
                    p.setNe(sentence.entities().get()[i]);
                }
                if (sentence.norms().isDefined()) {
                    // the lemma is used as the normalized form (norms() was tried
                    // and commented out in earlier revisions)
                    p.setNormalizedWord(sentence.lemmas().get()[i]);
                }
                sentenceNodes.add(p);
            }

            if (sentence.dependencies().isDefined()) {
                // NOTE(review): dependency edges are written onto nodes in edge
                // iteration order, not by token index — kept as in the original;
                // confirm this mapping is intentional.
                int i = 0;
                DirectedGraphEdgeIterator<String> iterator =
                        new DirectedGraphEdgeIterator<String>(sentence.dependencies().get());
                while (iterator.hasNext()) {
                    scala.Tuple3<Object, Object, String> dep = iterator.next();
                    if (i > sentenceNodes.size() - 1)
                        break;
                    ParseTreeNode p = sentenceNodes.get(i);
                    p.setHead(dep._1().toString());
                    p.setModifier(dep._2().toString());
                    p.setLabel(dep._3());
                    sentenceNodes.set(i, p);
                    i++;
                }
            }
            if (sentence.syntacticTree().isDefined()) {
                ptTrees.add(Tree.valueOf(sentence.syntacticTree().get().toString()));
            }
            nodesThicket.add(sentenceNodes);
        }

        if (doc.coreferenceChains().isDefined()) {
            // these are scala.collection Iterator and Iterable (not Java!)
            scala.collection.Iterator<scala.collection.Iterable<CorefMention>> chains =
                    doc.coreferenceChains().get().getChains().iterator();
            while (chains.hasNext()) {
                scala.collection.Iterator<CorefMention> chain = chains.next().iterator();
                int numInChain = 0; // number of mention slots actually written
                int[] niSentence = new int[MAX_MENTIONS_PER_CHAIN];
                int[] niWord = new int[MAX_MENTIONS_PER_CHAIN];
                int[] startOffset = new int[MAX_MENTIONS_PER_CHAIN];

                while (chain.hasNext() && numInChain < MAX_MENTIONS_PER_CHAIN) {
                    CorefMention mention = chain.next();
                    // all these offsets start at 0
                    niSentence[numInChain] = mention.sentenceIndex();
                    niWord[numInChain] = mention.headIndex();
                    startOffset[numInChain] = mention.startOffset();
                    numInChain++;
                }
                // Link consecutive mentions. Each arc reads slots i and i+1, so the
                // loop stops at numInChain-1 (the original read one slot past the
                // written range for chains shorter than the buffer).
                for (int i = 0; i + 1 < numInChain; i++) {
                    ArcType arcType = new ArcType("coref-", "", 0, 0);
                    WordWordInterSentenceRelationArc arc =
                            new WordWordInterSentenceRelationArc(
                                    new Pair<Integer, Integer>(niSentence[i], niWord[i]),
                                    new Pair<Integer, Integer>(niSentence[i + 1], niWord[i + 1]),
                                    startOffset[i] + "", startOffset[i + 1] + "",
                                    arcType);
                    arcs.add(arc);
                }
            }
        }

        List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
        arcs.addAll(arcsCA);
        ParseThicketWithDiscourseTree result = new ParseThicketWithDiscourseTree(ptTrees, arcs);

        if (doc.discourseTree().isDefined()) {
            Option<DiscourseTree> discourseTree = doc.discourseTree();
            scala.collection.Iterator<DiscourseTree> iterator = discourseTree.iterator();
            while (iterator.hasNext()) {
                DiscourseTree dt = iterator.next();
                result.setDt(dt); // setDt retains only the first (top-level) tree
                List<WordWordInterSentenceRelationArc> rstArcs =
                        new ArrayList<WordWordInterSentenceRelationArc>();
                navigateDiscourseTree(dt, rstArcs, nodesThicket);
                arcs.addAll(rstArcs);
            }
        }

        result.setOrigText(text);
        result.setNodesThicket(nodesThicket);
        result.setDtDump(); // sets the DT representation for TK learning
        return result;
    }

    /**
     * Builds communicative-action (CA) arcs between every pair of sentences
     * that both contain a CA; the arc label is the generalization of the two.
     */
    public List<WordWordInterSentenceRelationArc> buildCAarcs(
            List<List<ParseTreeNode>> nodesThicket) {
        List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();

        for (int sentI = 0; sentI < nodesThicket.size(); sentI++) {
            for (int sentJ = sentI + 1; sentJ < nodesThicket.size(); sentJ++) {
                List<ParseTreeNode> sentenceI = nodesThicket.get(sentI);
                List<ParseTreeNode> sentenceJ = nodesThicket.get(sentJ);
                Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
                Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
                if (caI == null || caJ == null)
                    continue; // no CA in at least one sentence of the pair
                int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
                int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
                Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);

                ArcType arcType = new ArcType("ca",
                        caGen.getFirst() + printNumArray(caGen.getSecond()), 0, 0);
                arcs.add(new WordWordInterSentenceRelationArc(
                        new Pair<Integer, Integer>(sentI, indexCA1),
                        new Pair<Integer, Integer>(sentJ, indexCA2),
                        caI.getFirst(), caJ.getFirst(), arcType));
            }
        }
        return arcs;
    }

    /** Renders an Integer array as space-separated numbers (trailing space kept). */
    private String printNumArray(Integer[] arr) {
        StringBuilder buf = new StringBuilder();
        for (Integer i : arr) {
            buf.append(i).append(' ');
        }
        return buf.toString();
    }

    /**
     * Recursively creates arcs from the discourse tree {@code dt}, using
     * {@code nodesThicket} to resolve the words being connected. Terminal
     * nodes carry no relation of their own and contribute nothing.
     */
    private void navigateDiscourseTree(DiscourseTree dt,
            List<WordWordInterSentenceRelationArc> arcs,
            List<List<ParseTreeNode>> nodesThicket) {
        if (dt.isTerminal()) {
            return;
        }
        ArcType arcType = new ArcType("rst",
                dt.relationLabel() + "=>" + dt.kind(),
                Boolean.compare(dt.relationDirection().equals("LeftToRight"), true), 0);
        String lemmaFrom = nodesThicket.get(dt.firstSentence())
                .get(dt.firstToken().copy$default$2()).getWord();
        String lemmaTo = nodesThicket.get(dt.lastSentence())
                .get(dt.lastToken().copy$default$2() - 1).getWord();

        arcs.add(new WordWordInterSentenceRelationArc(
                new Pair<Integer, Integer>(dt.firstToken().copy$default$1(), dt.firstToken().copy$default$2()),
                new Pair<Integer, Integer>(dt.lastToken().copy$default$1(), dt.lastToken().copy$default$2()),
                lemmaFrom, lemmaTo, arcType));

        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                navigateDiscourseTree(kid, arcs, nodesThicket);
            }
        }
    }

    public static void main(String[] args) {
        ParseCorefBuilderWithNERandRST builder = new ParseCorefBuilderWithNERandRST();
        String text = "I thought I d tell you a little about what I like to write. And I like to immerse myself in my topics. I just like to dive right in and become sort of a human guinea pig. And I see my life as a series of experiments. So , I work for Esquire magazine , and a couple of years ago I wrote an article called My Outsourced Life , where I hired a team of people in Bangalore , India , to live my life for me. "
                + "So they answered my emails. They answered my phone. ";

        ParseThicket pt = builder.buildParseThicket(text);
        pt = builder.buildParseThicket(
                "Dutch accident investigators say that evidence points to pro-Russian rebels as being responsible for shooting down plane. The report indicates where the missile was fired from and identifies who was in control of the territory and pins the downing of the plane on the pro-Russian rebels. "
                + "However, the Investigative Committee of the Russian Federation believes that the plane was hit by a missile from the air which was not produced in Russia. "
                + "At the same time, rebels deny that they controlled the territory from which the missile was supposedly fired.");
    }
}
package opennlp.tools.parse_thicket.external_rst;

import java.util.List;

import edu.arizona.sista.discourse.rstparser.DiscourseTree;
import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree;

/*
 * Subclass of ParseThicket with the focus on the Discourse Tree (DT).
 * It produces string representations of the discourse tree for tree-kernel learning.
 */
public class ParseThicketWithDiscourseTree extends ParseThicket {

    // A prefix match of this many tokens (plus one) is accepted as an alignment
    // between an EDU text and a window of a sentence.
    private static final int ALIGNMENT_PREFIX_TOKENS = 3;

    // EDUs longer than this many tokens get the cheaper lemma-POS substitution
    // instead of the VerbNet-enriched one.
    private static final int MAX_EDU_TOKENS_FOR_VERBNET = 100;

    private DiscourseTree dt;
    private String dtDump;                  // plain DT dump
    private String dtDumpWithPOS;           // DT with lemma-POS leaves
    private String dtDumpWithEmbeddedTrees; // DT with syntactic subtrees as leaves
    private String dtDumpWithVerbNet;       // DT with VerbNet-enriched verb leaves

    private final TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree();
    private final VerbNetProcessor verbBuilder = VerbNetProcessor.getInstance(null);

    public ParseThicketWithDiscourseTree(List<Tree> ptTrees,
            List<WordWordInterSentenceRelationArc> barcs) {
        super(ptTrees, barcs);
    }

    public DiscourseTree getDt() {
        return dt;
    }

    /** Sets the highest-level DT; under further iterations does not set anything. */
    public void setDt(DiscourseTree dt) {
        if (this.dt == null)
            this.dt = dt;
    }

    /** Computes all four DT string representations from the current tree. */
    public void setDtDump() {
        dtDumpWithPOS = toStringBuilderDTWithPOSSeq(new StringBuilder(100000), this.dt).toString();
        dtDump = toStringBuilderDT(new StringBuilder(100000), this.dt).toString();
        dtDumpWithEmbeddedTrees =
                toStringBuilderDTWithEmbeddedTrees(new StringBuilder(100000), this.dt).toString();
        dtDumpWithVerbNet = toStringBuilderDTWithVerbNet(new StringBuilder(100000), this.dt).toString();
    }

    // basic representation of the discourse tree
    private StringBuilder toStringBuilderDT(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null) {
                sb.append(dt.relationLabel());
                // different StringBuilder type for trees coming from scala
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "(");
                text = text.substring(0, text.length() - 1) + ")";
                sb.append(text);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDT(sb, kid);
            }
        }
        return sb.append(')');
    }

    // DT representation whose leaves are lemma-POS sequences
    private StringBuilder toStringBuilderDTWithPOSSeq(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                String textDump = substituteTextWithPOStext(text,
                        this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                if (textDump != null) // alignment may fail; original appended the string "null"
                    sb.append(textDump);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithPOSSeq(sb, kid);
            }
        }
        return sb.append(')');
    }

    /**
     * Finds the window of {@code list} whose words align with the first tokens
     * of {@code text} and renders it as a lemma-POS sequence.
     *
     * @return the substituted representation, or {@code null} if no alignment found
     */
    private String substituteTextWithPOStext(String text, List<ParseTreeNode> list) {
        String[] tokens = text.split(" ");
        // fixed: bound the window so subList() cannot run past the end of the list
        // (the original iterated offsets up to list.size()-1 and threw
        // IndexOutOfBoundsException on long EDUs)
        for (int offset = 0; offset + tokens.length <= list.size(); offset++) {
            List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
            if (alignsWithTokens(subList, tokens)) {
                return ParseTreeNode.toTreeRepresentationString(subList);
            }
        }
        return null;
    }

    // True when the first ALIGNMENT_PREFIX_TOKENS+1 words (or all, if fewer) match.
    private boolean alignsWithTokens(List<ParseTreeNode> subList, String[] tokens) {
        int count = 0;
        for (ParseTreeNode n : subList) {
            if (!n.getWord().equals(tokens[count]))
                return false;
            count++;
            if (count > ALIGNMENT_PREFIX_TOKENS) // a few tokens are enough for alignment
                break;
        }
        return true;
    }

    // DT representation whose leaves are syntactic subtrees of the sentences
    private StringBuilder toStringBuilderDTWithEmbeddedTrees(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                substituteTextWithParseTree(sb, text,
                        this.getSentenceTrees().get(dt.firstToken().copy$default$1()));
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithEmbeddedTrees(sb, kid);
            }
        }
        return sb.append(')');
    }

    // Appends to sb the subtree of sentenceTree rooted at the node covering the
    // first word(s) of text; appends nothing when no such subtree is found.
    private void substituteTextWithParseTree(StringBuilder sb, String text, Tree sentenceTree) {
        String[] tokens = text.split(" ");
        List<Tree> foundTrees;
        if (tokens.length > 1) {
            foundTrees = extender.getASubtreeWithRootAsNodeForWord1(
                    sentenceTree, sentenceTree, new String[] { tokens[0], tokens[1] });
        } else {
            foundTrees = extender.getASubtreeWithRootAsNodeForWord1(
                    sentenceTree, sentenceTree, new String[] { tokens[0] });
        }

        if (foundTrees == null || foundTrees.size() < 1)
            return;

        extender.toStringBuilder(sb, foundTrees.get(0));
    }

    // DT representation with VerbNet-enriched verbs at the leaves
    private StringBuilder toStringBuilderDTWithVerbNet(StringBuilder sb, DiscourseTree dt) {
        if (dt.isTerminal()) {
            if (dt.relationLabel() != null && dt.relationLabel().length() > 2) {
                sb.append(dt.relationLabel());
                scala.collection.mutable.StringBuilder sbs =
                        new scala.collection.mutable.StringBuilder(100);
                dt.print(sbs, 0, false, true);
                String text = sbs.replaceAllLiterally("Nucleus TEXT:", "");
                String textDump;
                if (text.split(" ").length < MAX_EDU_TOKENS_FOR_VERBNET) {
                    // if not TOO long, a more informative substitution including VerbNet
                    textDump = substituteTextWithPOStextVerbNet(text,
                            this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                } else {
                    // otherwise just lemma-POS chains
                    textDump = substituteTextWithPOStext(text,
                            this.getNodesThicket().get(dt.firstToken().copy$default$1()));
                }
                if (textDump != null) // alignment may fail; original appended the string "null"
                    sb.append(textDump);
            }
            return sb;
        }
        sb.append('(');
        if (dt.relationLabel() != null) {
            sb.append(dt.relationLabel());
        }
        DiscourseTree[] kids = dt.children();
        if (kids != null) {
            for (DiscourseTree kid : kids) {
                sb.append(' ');
                toStringBuilderDTWithVerbNet(sb, kid);
            }
        }
        return sb.append(')');
    }

    /**
     * Substitutes lemma-POS pairs instead of plain lemmas; verbs get a more
     * detailed VerbNet-based representation when one is available.
     *
     * @return the substituted representation, or {@code null} if no alignment found
     */
    private String substituteTextWithPOStextVerbNet(String text, List<ParseTreeNode> list) {
        String[] tokens = text.split(" ");
        // fixed: same subList bounds fix as substituteTextWithPOStext
        for (int offset = 0; offset + tokens.length <= list.size(); offset++) {
            List<ParseTreeNode> subList = list.subList(offset, tokens.length + offset);
            if (!alignsWithTokens(subList, tokens))
                continue;
            // alignment found: render the window
            StringBuilder buf = new StringBuilder();
            for (ParseTreeNode ch : subList) {
                try {
                    String pos = ch.getPos();
                    if (pos.startsWith(".") || pos.startsWith(",")
                            || pos.startsWith(";") || pos.startsWith("!"))
                        continue; // skip punctuation tokens
                    if (pos.startsWith("VB") && ch.getNormalizedWord() != null) {
                        // more info for verbs, when VerbNet knows the lemma
                        StringBuilder verbRepr = verbBuilder
                                .buildTreeRepresentationForTreeKernelLearning(ch.getNormalizedWord());
                        if (verbRepr != null)
                            buf.append(" (").append(verbRepr).append(") ");
                        else
                            buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
                    } else { // other than verb
                        buf.append("(").append(ch.getWord()).append(" ").append(pos).append(")");
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            return buf.toString().trim();
        }
        return null;
    }

    public String getDtDump() {
        return this.dtDump;
    }

    public String getDtDumpWithPOS() {
        return this.dtDumpWithPOS;
    }

    public String getDtDumpWithEmbeddedTrees() {
        return this.dtDumpWithEmbeddedTrees;
    }

    public String getDtDumpWithVerbNet() {
        return this.dtDumpWithVerbNet;
    }
}
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/external_rst/RstNode.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.external_rst; - -import org.apache.commons.lang.StringUtils; - -import opennlp.tools.similarity.apps.utils.Pair; - -public class RstNode { - Boolean isNucleus; - Pair<Integer, Integer> span; - Integer leaf; - String rel2par; - String text; - Integer level; - - public Boolean getIsNucleus() { - return isNucleus; - } - public void setIsNucleus(Boolean isNucleus) { - this.isNucleus = isNucleus; - } - public Pair<Integer, Integer> getSpan() { - return span; - } - public void setSpan(Pair<Integer, Integer> span) { - this.span = span; - } - public Integer getLeaf() { - return leaf; - } - public void setLeaf(Integer leaf) { - this.leaf = leaf; - } - public String getRel2par() { - return rel2par; - } - public void setRel2par(String rel2par) { - this.rel2par = rel2par; - } - public String getText() { - return text; - } - public void setText(String text) { - this.text = text; - } - - public String toString() { - String ret = ""; - if (isNucleus!=null && isNucleus) - ret+="Nucleus "; - if (span!=null) - 
ret+="["+span.getFirst()+" "+ span.getSecond()+"]"; - ret += " >> "+ rel2par; - if (text!=null) - ret+= " >> "+text; - return ret; - } - public RstNode(String line) { - if (StringUtils.trim(line).startsWith(")")) - return; - - - level = line.indexOf("("); - line = line.substring(line.indexOf("(")+2); - - isNucleus = line.substring(0, line.indexOf("(")).indexOf("Nucleus")>-1; - line = line.substring(line.indexOf("(")+1); - if (line.startsWith("span")){ - line = line.substring(5); - try { - span = new Pair<Integer, Integer>(); - String[] spanStr = line.substring(0, line.indexOf(")")).split(" "); - span.setFirst(Integer.parseInt(spanStr[0])); - span.setSecond(Integer.parseInt(spanStr[1])); - } catch (Exception e) { - e.printStackTrace(); - } - - } else if (line.startsWith("leaf")){ - try { - String leafStr = line.substring(5, line.indexOf(")")); - leaf = Integer.parseInt(leafStr); - } catch (Exception e) { - e.printStackTrace(); - } - - } else System.err.println("Problem parsing RST results: '"+line); - - line = line.substring(line.indexOf("rel2par")+8); - rel2par = line.substring(0, line.indexOf(")")).trim(); - - text = StringUtils.substringBetween(line, "_!", "_!)"); - - - } - - public static void main(String[] args){ - RstNode n1 = new RstNode(" ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) 
)"), - n2 = new RstNode(" )"), - n3 = new RstNode(" ( Satellite (span 15 16) (rel2par Explanation)"); - - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java deleted file mode 100644 index b41cd46..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/BracesProcessor.java +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
import java.util.ArrayDeque;
import java.util.Deque;

/**
 * Utilities for checking balanced brackets: an incremental checker that can be
 * fed text in portions, plus static one-shot checkers.
 */
public class BracesProcessor {
    /** The incremental result is only positive when more than this many openers were seen. */
    private static final int MIN_BRACES_CNT = 5;

    private static final char L_PAREN = '(';
    private static final char R_PAREN = ')';
    private static final char L_BRACE = '{';
    private static final char R_BRACE = '}';
    private static final char L_BRACKET = '[';
    private static final char R_BRACKET = ']';

    // Deque instead of the legacy java.util.Stack (unsynchronized, faster)
    private final Deque<Character> stackIncremental = new ArrayDeque<Character>();
    private int count = 0;              // total opening brackets seen so far
    private boolean balancedSoFar = true;

    /**
     * @return true iff everything fed so far is balanced, fully closed, and
     *         contained more than {@link #MIN_BRACES_CNT} opening brackets
     */
    public Boolean getBalancedBracesResult() {
        return balancedSoFar && stackIncremental.isEmpty() && count > MIN_BRACES_CNT;
    }

    /**
     * Feeds the next portion of text to the incremental checker.
     * Non-bracket characters are ignored. Once an imbalance is seen the
     * checker latches {@code balancedSoFar = false}.
     */
    public void analyzeBalancedBracesAddPortionIncremental(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case L_PAREN:
                case L_BRACE:
                case L_BRACKET:
                    stackIncremental.push(c);
                    count++;
                    break;
                case R_PAREN:
                    // fixed: the original popped even when the stack was empty,
                    // throwing EmptyStackException; short-circuit avoids the pop
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_PAREN)
                        balancedSoFar = false;
                    break;
                case R_BRACE:
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_BRACE)
                        balancedSoFar = false;
                    break;
                case R_BRACKET:
                    if (stackIncremental.isEmpty()
                            || stackIncremental.pop().charValue() != L_BRACKET)
                        balancedSoFar = false;
                    break;
                default:
                    // ignore all other characters
            }
        }
    }

    /**
     * One-shot check that {@code s} has balanced (), {} and [].
     * Non-bracket characters are ignored.
     */
    public static boolean isBalanced(String s) {
        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == L_PAREN || c == L_BRACE || c == L_BRACKET) {
                stack.push(c);
            } else if (c == R_PAREN) {
                if (stack.isEmpty() || stack.pop().charValue() != L_PAREN) return false;
            } else if (c == R_BRACE) {
                if (stack.isEmpty() || stack.pop().charValue() != L_BRACE) return false;
            } else if (c == R_BRACKET) {
                if (stack.isEmpty() || stack.pop().charValue() != L_BRACKET) return false;
            }
            // ignore all other characters
        }
        return stack.isEmpty();
    }

    /**
     * Checks that (), {} and [] are properly nested and matched.
     * Fixed: the original's closing-bracket condition was mis-parenthesized
     * ({@code current == '}' && (last == '{' || current == ')') && last == '(' || ...}),
     * accepting mismatched pairs and rejecting valid ones.
     */
    public static boolean checkParentesis(String str) {
        if (str.isEmpty())
            return true;

        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            if (current == '{' || current == '(' || current == '[') {
                stack.push(current);
            } else if (current == '}' || current == ')' || current == ']') {
                if (stack.isEmpty())
                    return false;
                char last = stack.pop();
                boolean matches = (current == '}' && last == '{')
                        || (current == ')' && last == '(')
                        || (current == ']' && last == '[');
                if (!matches)
                    return false;
            }
        }
        return stack.isEmpty();
    }

    /**
     * Checks that () and {} are properly nested and matched.
     * Fixed: the original returned false on any '{' and the brace-matching
     * branch below that early return was unreachable dead code.
     */
    public static boolean isParenthesisMatch(String str) {
        Deque<Character> stack = new ArrayDeque<Character>();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == '(' || c == '{') {
                stack.push(c);
            } else if (c == ')') {
                if (stack.isEmpty() || stack.pop().charValue() != '(')
                    return false;
            } else if (c == '}') {
                if (stack.isEmpty() || stack.pop().charValue() != '{')
                    return false;
            }
        }
        return stack.isEmpty();
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parse_thicket.kernel_interface;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

/**
 * Extracts "descriptive" paragraphs from a document (any format Tika can
 * parse). A paragraph is considered descriptive when it is long enough,
 * word-rich enough, and its average sentence length clears a threshold.
 */
public class DescriptiveParagraphFromDocExtractor {
    protected static Tika tika = new Tika();

    // tuning constants, in characters unless noted (made final: they were
    // mutable static ints before)
    private static final int MIN_PARA_LENGTH = 200;
    private static final int MIN_NUM_WORDS = 15;          // in words
    private static final int MAX_PARA_LENGTH = 500;
    private static final int TEXT_PORTION_FOR_ANALYSIS = 20000;
    private static final int MAX_PARA_OUTPUT = 20;        // max paragraphs returned

    /**
     * Returns the first descriptive paragraph of {@code f}, or (as a fallback)
     * the first 150 characters of its text when no paragraph qualifies.
     */
    public static String getFirstParagraphFromFile(File f) {
        String text = "";
        try {
            try {
                text = tika.parseToString(f);
            } catch (TikaException e) {
                e.printStackTrace();
            }
            if (text.length() > TEXT_PORTION_FOR_ANALYSIS)
                text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
            float avgSentSizeThr = (float) MIN_PARA_LENGTH / 4f;
            String[] portions = text.split("\\.\\n");
            for (String p : portions) {
                float avgSentSize = (float) p.length() / (float) p.split("\\n\\n").length;
                if (p.length() > MIN_PARA_LENGTH && p.split(" ").length > MIN_NUM_WORDS
                        && avgSentSize > avgSentSizeThr && p.length() < MAX_PARA_LENGTH) {
                    return normalizePara(p);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // no suitable paragraph found: return the head of the whole text
        if (text.length() > 150)
            text = text.substring(0, 150);
        return text;
    }

    /**
     * Returns up to {@link #MAX_PARA_OUTPUT} descriptive paragraphs from
     * {@code f}; falls back to a truncated head of the text when none qualify.
     */
    public static List<String> getLongParagraphsFromFile(File f) {
        List<String> results = new ArrayList<String>();
        String text = "";
        try {
            try {
                text = tika.parseToString(f);
            } catch (TikaException e) {
                e.printStackTrace();
            }
            if (text.length() > TEXT_PORTION_FOR_ANALYSIS)
                text = text.substring(0, TEXT_PORTION_FOR_ANALYSIS);
            float avgSentSizeThr = (float) MIN_PARA_LENGTH / 4f;
            // try progressively looser paragraph delimiters
            String[] portions = text.split("\\.\\n");
            if (portions.length < 2)
                portions = text.split("\\n\\n");
            if (portions.length < 2)
                portions = text.split("\\n \\n");
            if (portions.length < 2)
                portions = splitBySentenceBudget(text);
            for (String p : portions) {
                try {
                    float avgSentSize = (float) p.length() / (float) p.split("\\n\\n").length;
                    if (p.length() > MIN_PARA_LENGTH && p.split(" ").length > MIN_NUM_WORDS
                            && avgSentSize > avgSentSizeThr) {
                        if (p.length() < MAX_PARA_LENGTH) {
                            results.add(normalizePara(p));
                        } else { // reduce length to the last '.' in the substring
                            String pReduced = p;
                            if (p.length() >= MAX_PARA_LENGTH + 80)
                                pReduced = p.substring(0, MAX_PARA_LENGTH + 80);
                            int indexPeriod = pReduced.lastIndexOf('.');
                            if (indexPeriod > -1)
                                pReduced = pReduced.substring(0, indexPeriod);
                            results.add(normalizePara(pReduced));
                        }
                        // fixed off-by-one: the original allowed MAX_PARA_OUTPUT+1 results
                        if (results.size() >= MAX_PARA_OUTPUT)
                            break;
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (results.size() < 1) {
                if (text.length() >= MAX_PARA_LENGTH + 80)
                    text = text.substring(0, MAX_PARA_LENGTH + 80);
                results.add(text);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (results.size() < 1) {
            System.err.println("Failed to extract text from " + f.getName());
        }
        return results;
    }

    /**
     * Last-resort splitter: greedily packs sentences into chunks of roughly
     * MAX_PARA_LENGTH characters. Fixed vs. the original inline version, which
     * dropped the sentence that overflowed a chunk and lost the final partial
     * chunk entirely.
     */
    private static String[] splitBySentenceBudget(String text) {
        // NOTE(review): the '.'->'&' / " & " split is kept from the original;
        // it only separates sentences written as "x . y" — confirm intent.
        String[] sentences = text.replace('.', '&').split(" & ");
        List<String> portionsLst = new ArrayList<String>();
        int totalChars = 0;
        StringBuilder buffer = new StringBuilder();
        for (String sent : sentences) {
            buffer.append(sent).append(". ");
            totalChars += sent.length();
            if (totalChars > MAX_PARA_LENGTH) {
                portionsLst.add(buffer.toString());
                buffer.setLength(0);
                totalChars = 0;
            }
        }
        if (buffer.length() > 0)
            portionsLst.add(buffer.toString());
        return portionsLst.toArray(new String[0]);
    }

    // Flattens newlines, strips ellipses/double spaces and any character
    // outside a small ASCII whitelist.
    private static String normalizePara(String p) {
        p = p.replaceAll("\\n", " ").replaceAll("\\.\\.", " ").replaceAll("  ", " ");
        p = p.replaceAll("[^A-Za-z0-9 _\\.,\\!]", "");
        return p;
    }

    public static void main(String[] args) {
        // NOTE(review): developer-machine demo paths kept from the original;
        // this main only works where those files exist.
        List<String> results = getLongParagraphsFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/design_doc_posNeg/pos/2IP40 Detail Design Document.pdf"));
        System.out.println(results);

        String res = getFirstParagraphFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
                        + "design_doc/2004Schalk_BCI2000Implementation.pdf"));
        System.out.println(res);
        results = getLongParagraphsFromFile(new File(
                "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/1k/"
                        + "design_doc/2004Schalk_BCI2000Implementation.pdf"));
        System.out.println(results);
    }
}
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.util.StringUtils; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.apps.WebPageContentSentenceExtractor; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceExtendedForestSearchResultsProcessorSetFormer extends MultiSentenceKernelBasedSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor"); - protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree(); - - private TreeKernelRunner tkRunner = new 
TreeKernelRunner(); - - protected static final String modelFileName = "model.txt"; - - private static final String trainingFileName = "training.txt"; - - protected static final String unknownToBeClassified = "unknown.txt"; - - private static final String classifierOutput = "classifier_output.txt"; - - private String path; - public void setKernelPath (String path){ - this.path=path; - } - - WebPageContentSentenceExtractor extractor = new WebPageContentSentenceExtractor(); - - private List<HitBase> formTreeForestDataSet( - List<HitBase> hits, String query, boolean isPositive) { - List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documents. Since search results are ranked, we set the first half as positive set, - //and the second half as negative set. - // after re-classification, being re-ranked, the search results might end up in a different set - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - int count = 0; - for (HitBase hit : hits) { - count++; - // if orig content has been already set in HIT object, ok; otherwise set it - String searchResultText = hit.getPageContent(); - if (searchResultText ==null){ - try { - HitBase hitWithFullSents = extractor.formTextFromOriginalPageGivenSnippet(hit); - for(String paragraph: hitWithFullSents.getOriginalSentences()){ - List<String[]> res = formTreeKernelStructure(paragraph, count, hits, isPositive); - for(String[] rl : res){ - StringUtils.printToFile(new File(path+trainingFileName), rl[0]+" \n", true); - } - //treeBankBuffer.addAll(res); - } - } catch (Exception e) { - e.printStackTrace(); - } - - } - newHitList.add(hit); - - - } - // write the lits of samples to a file - ProfileReaderWriter.appendReport(treeBankBuffer, path+trainingFileName, ' '); - return newHitList; - - } - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits, boolean isPositive) { - List<String[]> 
treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<Tree> forest = pt.getSentences(); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (isPositive) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the list of training samples - for(Tree t: forest){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); - } - } catch (Exception e) { - e.printStackTrace(); - } - return treeBankBuffer; - } - - public List<HitBase> runSearchViaAPI(String query, Boolean isPositive) { - - try { - List<HitBase> hits = bingSearcher.runSearch(query, 20, true); - formTreeForestDataSet(hits, query, isPositive); - - } catch (Exception e) { - e.printStackTrace(); - LOG.info("No search results for query '" + query); - return null; - } - - - return null; - } - public static void main(String[] args){ - String query = "digital camera for my mother as a gift"; - Boolean isPositive = true; - if (args!=null && args.length>0){ - query = args[0]; - if (args.length>1 && args[1]!=null && args[1].startsWith("neg")) - isPositive = false; - } - - MultiSentenceExtendedForestSearchResultsProcessorSetFormer proc = new MultiSentenceExtendedForestSearchResultsProcessorSetFormer(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel_big\\"); - proc.runSearchViaAPI(query, isPositive); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java deleted file mode 100644 index 1b2790f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceKernelBasedExtendedForestSearchResultsProcessor extends MultiSentenceKernelBasedSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor"); - protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree(); - - - - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) { - List<String[]> treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (count<hits.size()/2) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the 
list of training samples - for(String t: extendedTreesDump){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t+ " |ET|"}); - } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return treeBankBuffer; - } - - public static void main(String[] args){ - String query = null; - - /*" I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " + - "standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " + - "command that was either oblivious to or tolerant of criminal behavior"; - - query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US"; - - query = "ECUADOR'S PRESIDENT RAFAEL CORREA SAYS U.S. VP JOE BIDEN WANTS HIM TO REFUSE WHISTLEBLOWER EDWARD SNOWDEN'S BID FOR ASYLUM"; - query = "how to pay tax on foreign income from real estate"; - */ - if (args!=null && args.length>0) - query = args[0]; - - MultiSentenceKernelBasedExtendedForestSearchResultsProcessor proc = new MultiSentenceKernelBasedExtendedForestSearchResultsProcessor(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\"); - proc.runSearchViaAPI(query); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java deleted file mode 100644 index 39d348e..0000000 --- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.trees.Tree; - - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.apps.BingQueryRunnerMultipageSearchResults; -import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.similarity.apps.BingQueryRunner; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.HitBaseComparable; -import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; -import opennlp.tools.textsimilarity.SentencePairMatchResult; -import 
opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; - -public class MultiSentenceKernelBasedSearchResultsProcessor extends MultiSentenceSearchResultsProcessor{ - private static Logger LOG = Logger - .getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedSearchResultsProcessor"); - - private WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper(); - protected Matcher matcher = new Matcher(); - private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); - protected BingQueryRunnerMultipageSearchResults bingSearcher = new BingQueryRunnerMultipageSearchResults(); - private SnippetToParagraph snp = new SnippetToParagraph(); - private TreeKernelRunner tkRunner = new TreeKernelRunner(); - - private String path; - public void setKernelPath (String path){ - this.path=path; - } - protected static final String modelFileName = "model.txt"; - - private static final String trainingFileName = "training.txt"; - - protected static final String unknownToBeClassified = "unknown.txt"; - - private static final String classifierOutput = "classifier_output.txt"; - - - public List<HitBase> runSearchViaAPI(String query) { - List<HitBase> hits = null; - try { - List<HitBase> resultList = bingSearcher.runSearch(query); - // now we apply our own relevance filter - //hits = calculateMatchScoreResortHits(resultList, query); - - hits = resultList; - //once we applied our re-ranking, we set highly ranked as positive set, low-rated as negative set - //and classify all these search results again - //training set is formed from original documents for the search results, - // and snippets of these search results are classified - hits = filterOutIrrelevantHitsByTreeKernelLearning(hits, query); - - } catch (Exception e) { - e.printStackTrace(); - LOG.info("No search results for query '" + query); - return null; - } - - - return hits; - } - - private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning( - 
List<HitBase> hits, String query) { - List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documents. Since search results are ranked, we set the first half as positive set, - //and the second half as negative set. - // after re-classification, being re-ranked, the search results might end up in a different set - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - int count = 0; - for (HitBase hit : hits) { - count++; - // if orig content has been already set in HIT object, ok; otherwise set it - String searchResultText = hit.getPageContent(); - if (searchResultText ==null){ - String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit); - searchResultText = pageSentsAndSnippet[0]; - hit.setPageContent(searchResultText); - } - newHitList.add(hit); - treeBankBuffer.addAll(formTreeKernelStructure(searchResultText, count, hits)); - - } - // write the lits of samples to a file - ProfileReaderWriter.writeReport(treeBankBuffer, path+trainingFileName, ' '); - // build the model - tkRunner.runLearner(path, trainingFileName, modelFileName); - - // now we preparing the same answers to be classifies in/out - treeBankBuffer = new ArrayList<String[]>(); - for (HitBase hit : newHitList) { - // not original docs now but instead a snippet - String searchResultTextAbstr = hit.getAbstractText(); - String snippet = searchResultTextAbstr.replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ") - .replace("<b>", "").replace("</b>", ""); - snippet = snippet.replace("</B>", "").replace("<B>", "") - .replace("<br>", "").replace("</br>", "").replace("...", ". ") - .replace("|", " ").replace(">", " ").replace(". .", ". 
"); - snippet = hit.getTitle() + " " + snippet; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(snippet); - //hit.getPageContent()); - List<Tree> forest = pt.getSentences(); - // we consider the snippet as a single sentence to be classified - if (forest.size()>0){ - treeBankBuffer.add(new String[] {"0 |BT| "+forest.get(0).toString()+ " |ET|"}); - newHitListReRanked .add(hit); - } - - } - // form a file from the snippets to be classified - ProfileReaderWriter.writeReport(treeBankBuffer, path+unknownToBeClassified, ' '); - tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput); - // read classification results - List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' '); - // iterate through classification results and set them as scores for hits - newHitList = new ArrayList<HitBase>(); - for(int i=0; i<newHitListReRanked.size() && i<classifResults.size() ; i++){ - String scoreClassif = classifResults.get(i)[0]; - float val = Float.parseFloat(scoreClassif); - HitBase hit = newHitListReRanked.get(i); - hit.setGenerWithQueryScore((double) val); - newHitList.add(hit); - } - - // sort by SVM classification results - Collections.sort(newHitList, new HitBaseComparable()); - System.out.println("\n\n ============= NEW ORDER ================= "); - for (HitBase hit : newHitList) { - System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); - System.out.println("page content = "+hit.getPageContent()); - System.out.println("title = "+hit.getAbstractText()); - System.out.println("snippet = "+hit.getAbstractText()); - System.out.println("match = "+hit.getSource()); - } - - return newHitList; - - } - - protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) { - List<String[]> treeBankBuffer = new ArrayList<String[]> (); - try { - // get the parses from original documents, and form the training dataset - ParseThicket 
pt = matcher.buildParseThicketFromTextWithRST(searchResultText); - List<Tree> forest = pt.getSentences(); - // if from the first half or ranked docs, then positive, otherwise negative - String posOrNeg = null; - if (count<hits.size()/2) - posOrNeg=" 1 "; - else - posOrNeg=" -1 "; - // form the list of training samples - for(Tree t: forest){ - treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); - } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return treeBankBuffer; - } - - public static void main(String[] args){ - String query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " + - "standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " + - "command that was either oblivious to or tolerant of criminal behavior"; - - query = "I am now living abroad and have health insurance from Russia. 
How can I avoid penalty for not having health insurance in US"; - - MultiSentenceKernelBasedSearchResultsProcessor proc = new MultiSentenceKernelBasedSearchResultsProcessor(); - proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\"); - proc.runSearchViaAPI(query); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java deleted file mode 100644 index fb5eed8..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.List; - -import edu.stanford.nlp.trees.Tree; - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.textsimilarity.ParseTreeChunk; - -public class PT2ExtendedTreeForestBuilder { - private Matcher matcher = new Matcher(); - private TreeKernelRunner tkRunner = new TreeKernelRunner(); - private static final String modelFileName = "model.txt", - trainingFileName = "training.txt"; - - private List<String[]> formTrainingSetFromText(String para, boolean positive){ - String prefix = null; - if (positive) - prefix=" 1 "; - else - prefix=" -1 "; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para); - List<Tree> forest = pt.getSentences(); - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - for(Tree t: forest){ - treeBankBuffer.add(new String[] {prefix+"|BT| "+t.toString()+ " |ET|"}); - } - return treeBankBuffer; - } - - private String formTrainingSetFromTextOneLine(String para, boolean positive){ - String prefix = null; - if (positive) - prefix=" 1 "; - else - prefix=" -1 "; - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para); - List<Tree> forest = pt.getSentences(); - String line = prefix; - for(Tree t: forest){ - line+= "|BT| "+t.toString()+ " |ET| "; - } - return line; - } - - public void formPosNegTrainingSet(String pos, String neg, String path){ - List<String[]> list = formTrainingSetFromText(pos, true), - negList= formTrainingSetFromText(neg, false); - list.addAll(negList); - ProfileReaderWriter.writeReport(list, path+trainingFileName, ' '); - tkRunner.runLearner(path, trainingFileName, modelFileName); - } - - public void classifySentences(String sentences, String path){ - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(sentences); - List<Tree> forest = pt.getSentences(); - 
List<String[]> treeBankBuffer = new ArrayList<String[]>(); - for(Tree t: forest){ - treeBankBuffer.add(new String[] {" 0 |BT| "+t.toString()+ " |ET|"}); - } - - ProfileReaderWriter.writeReport(treeBankBuffer, path+"unknown.txt", ' '); - tkRunner.runClassifier(path, "unknown.txt", modelFileName, "classifier_output.txt"); - } - - - public static void main(String[] args){ - - PT2ExtendedTreeForestBuilder builder = new PT2ExtendedTreeForestBuilder(); - - - String posSents = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+ - "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + - "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + - "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "; - - String negSents = "Iran refuses the UN offer to end a conflict over its nuclear weapons."+ - "UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " + - "A recent UN report presented charts saying Iran was working on nuclear weapons. " + - "Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. "; - builder.formPosNegTrainingSet(posSents, negSents, "C:\\stanford-corenlp\\tree_kernel\\"); - - - builder.classifySentences("Iran refuses Iraq's offer to end its conflict with UN. Iran passes a resolution prohibiting UN from doing second" + - " uranium enrichment site. Envoy to US says its nuclear development is for peaceful purposes. 
Material evidence againt US has been fabricated by UN.", - - "C:\\stanford-corenlp\\tree_kernel\\"); - } -} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java deleted file mode 100644 index d6a295f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.logging.Logger; - -import org.apache.commons.lang.StringUtils; - - -import opennlp.tools.parse_thicket.apps.MinedSentenceProcessor; -import opennlp.tools.parse_thicket.apps.SnippetToParagraph; -import opennlp.tools.similarity.apps.Fragment; -import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; -import opennlp.tools.similarity.apps.HitBase; -import opennlp.tools.similarity.apps.RelatedSentenceFinder; -import opennlp.tools.similarity.apps.utils.PageFetcher; -import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; -import opennlp.tools.similarity.apps.utils.Utils; -import opennlp.tools.textsimilarity.TextProcessor; - - -public class SnippetToParagraphFull extends SnippetToParagraph { - private PageFetcher pFetcher = new PageFetcher(); - private static Logger LOG = Logger - .getLogger("com.become.parse_thicket.apps.SnippetToParagraphFull"); - - - - public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) { - - String[] sents = extractSentencesFromPage(item.getUrl()); - - String title = item.getTitle().replace("<b>", " ").replace("</b>", " ") - .replace(" ", " ").replace(" ", " "); - // generation results for this sentence - List<String> result = new ArrayList<String>(); - // form plain text from snippet - String snapshot = item.getAbstractText().replace("<b>", " ") - .replace("</b>", " ").replace(" ", " ").replace(" ", " ").replace("\"", ""); - - String snapshotMarked = snapshot.replace(" ...", "."); - List<String> fragments = TextProcessor.splitToSentences(snapshotMarked); - if (fragments.size()<3 && StringUtils.countMatches(snapshotMarked, ".")>1){ - snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&"); - String[] fragmSents = snapshotMarked.split("&"); - fragments = Arrays.asList(fragmSents); - } - - 
for (String f : fragments) { - String followSent = null; - if (f.length() < 50) - continue; - String pageSentence = ""; - // try to find original sentence from webpage - - try { - String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( - f, sents); - pageSentence = mainAndFollowSent[0]; - followSent = mainAndFollowSent[1]; - if (pageSentence!=null) - result.add(pageSentence); - else { - result.add(f); - LOG.info("Could not find the original sentence \n"+f +"\n in the page " ); - } - //if (followSent !=null) - // result.add(followSent); - } catch (Exception e) { - - e.printStackTrace(); - } - } - item.setOriginalSentences(result); - return item; - } - - -} - http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/2707f665/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java deleted file mode 100644 index c980f9f..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.tools.parse_thicket.kernel_interface; - -import java.util.ArrayList; -import java.util.List; -import java.util.logging.Logger; - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.ParseTreeNode; -import opennlp.tools.parse_thicket.VerbNetProcessor; -import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc; -import opennlp.tools.parse_thicket.matching.Matcher; -import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; -import edu.stanford.nlp.trees.Tree; - -public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { - private static Logger log = Logger - .getLogger("opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree"); - - public List<String> buildForestForCorefArcs(ParseThicket pt){ - List<String> results = new ArrayList<String>(); - for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ - //if (!arc.getArcType().getType().startsWith("coref")) - // continue; - int fromSent = arc.getCodeFrom().getFirst(); - int toSent = arc.getCodeTo().getFirst(); - if (fromSent <1 || toSent <1 ) // TODO problem in sentence enumeration => skip building extended trees - return results; - - String wordFrom = arc.getLemmaFrom(); - String wordTo = arc.getLemmaTo(); - - List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), - pt.getSentences().get(fromSent-1), new String[]{ wordFrom}); - if (trees==null || trees.size()<1) - continue; - System.out.println(trees); 
- StringBuilder sb = new StringBuilder(10000); - toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent-1), trees.get(0), new String[]{wordTo}); - System.out.println(sb.toString()); - results.add(sb.toString()); - } - // if no arcs then orig sentences - if (results.isEmpty()){ - for(Tree t: pt.getSentences()){ - results.add(t.toString()); - } - } - return results; - } - // sentences in pt are enumerarted starting from 0; - //this func works with Sista version of Stanford NLP and sentences are coded from 0 - public List<String> buildForestForRSTArcs(ParseThicket pt){ - List<String> results = new ArrayList<String>(); - for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ - // TODO - uncomment - //if (!arc.getArcType().getType().startsWith("rst")) - // continue; - int fromSent = arc.getCodeFrom().getFirst(); - int toSent = arc.getCodeTo().getFirst(); - - String wordFrom = arc.getLemmaFrom(); - String wordTo = arc.getLemmaTo(); - - if (wordFrom == null || wordFrom.length()<1 || wordTo == null || wordTo.length()<1) - log.severe("Empty lemmas for RST arc "+ arc); - - List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent), - pt.getSentences().get(fromSent), new String[]{ wordFrom}); - if (trees==null || trees.size()<1) - continue; - System.out.println(trees); - StringBuilder sb = new StringBuilder(10000); - Tree tree = trees.get(0); - // instead of phrase type for the root of the tree, we want to put the RST relation name - if (arc.getArcType().getType().startsWith("rst")) - tree.setValue(arc.getArcType().getSubtype()); - - toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent), tree, new String[]{wordTo}); - System.out.println(sb.toString()); - results.add(sb.toString()); - } - // if no arcs then orig sentences - if (results.isEmpty()){ - for(Tree t: pt.getSentences()){ - results.add(t.toString()); - } - } - return results; - } - - public StringBuilder 
toStringBuilderExtenderByAnotherLinkedTree1(StringBuilder sb, Tree t, Tree treeToInsert, String[] corefWords) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - boolean bInsertNow=false; - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - if (corefWords!=null){ - String word = corefWords[corefWords.length-1]; - String phraseStr = kid.toString(); - phraseStr=phraseStr.replace(")", ""); - if (phraseStr.endsWith(word)){ - bInsertNow=true; - } - } - } - if (bInsertNow){ - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, null, null); - } - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, treeToInsert, null, null); - } else { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, treeToInsert, corefWords); - } - - } - } - - return sb.append(')'); - } - } - - // given a parse tree and a - public List<Tree> getASubtreeWithRootAsNodeForWord1(Tree tree, Tree currentSubTree, String[] corefWords){ - if (currentSubTree.isLeaf()){ - return null; - } - List<Tree> result = null; - Tree[] kids = currentSubTree.children(); - if (kids != null) { - boolean bFound=false; - String word = corefWords[corefWords.length-1]; - for (Tree kid : kids) { - if (bFound){ - result.add(kid); - } else { - String phraseStr = kid.toString(); - phraseStr=phraseStr.replace(")", ""); - if (phraseStr.endsWith(word)){ // found - bFound=true; - result = new ArrayList<Tree>(); - } - } - } - if (bFound){ - return result; - } - // if not a selected node, proceed with iteration - for (Tree kid : kids) { - List<Tree> ts = getASubtreeWithRootAsNodeForWord1(tree, kid, corefWords); - if (ts!=null) - return ts; - } - - } - return null; - } - - // now obsolete - public Tree[] getASubtreeWithRootAsNodeForWord(Tree 
tree, Tree currentSubTree, String[] corefWords){ - if (currentSubTree.isLeaf()){ - return null; - } - - - boolean bInsertNow=false; - /*List<ParseTreeNode> bigTreeNodes = parsePhrase(currentSubTree.label().value()); - for(ParseTreeNode smallNode: bigTreeNodes ){ - if (bigTreeNodes.get(0).getWord().equals("") ) - continue; - String word = bigTreeNodes.get(0).getWord(); - for(String cWord: corefWords){ - - if (word.equalsIgnoreCase(cWord)) - bInsertNow=true; - } - } */ - - String nodePhraseStr = currentSubTree.toString(); - System.out.println(nodePhraseStr); - for(String w: corefWords) - nodePhraseStr = nodePhraseStr.replace(w, ""); - // all words are covered - if (nodePhraseStr.toUpperCase().equals(nodePhraseStr)) - bInsertNow=true; - - //if(bInsertNow) - // return currentSubTree; - - Tree[] kids = currentSubTree.children(); - if (kids != null) { - /*for (Tree kid : kids) { - List<ParseTreeNode> bigTreeNodes = parsePhrase(kid.label().value()); - if (bigTreeNodes!=null && bigTreeNodes.size()>0 && bigTreeNodes.get(0)!=null && - bigTreeNodes.get(0).getWord().equalsIgnoreCase(corefWords[0])){ - bInsertNow=true; - return kids; - } - - }*/ - - - for (Tree kid : kids) { - Tree[] t = getASubtreeWithRootAsNodeForWord(tree, kid, corefWords); - if (t!=null) - return t; - } - - } - return null; - } - - - public StringBuilder toStringBuilderExtenderByAnotherLinkedTree(StringBuilder sb, Tree t, Tree treeToInsert) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - - boolean bInsertNow=false; - // we try match trees to find out if we are at the insertion position - if (treeToInsert!=null){ - List<ParseTreeNode> bigTreeNodes = parsePhrase(t.label().value()); - List<ParseTreeNode> smallTreeNodes = parsePhrase(treeToInsert.getChild(0).getChild(0).getChild(0).label().value()); - - System.out.println(t + " \n "+ 
treeToInsert+ "\n"); - - if (smallTreeNodes.size()>0 && bigTreeNodes.size()>0) - for(ParseTreeNode smallNode: smallTreeNodes ){ - if (!bigTreeNodes.get(0).getWord().equals("") - && bigTreeNodes.get(0).getWord().equalsIgnoreCase(smallNode.getWord())) - bInsertNow=true; - } - } - - if (bInsertNow){ - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, kid, null); - } - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, treeToInsert.getChild(0).getChild(1), null); - int z=0; z++; - } - } else { - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilderExtenderByAnotherLinkedTree(sb, kid, treeToInsert); - } - - } - } - return sb.append(')'); - } - } - - public StringBuilder toStringBuilder(StringBuilder sb, Tree t) { - if (t.isLeaf()) { - if (t.label() != null) { - sb.append(t.label().value()); - } - return sb; - } else { - sb.append('('); - if (t.label() != null) { - if (t.value() != null) { - sb.append(t.label().value()); - } - } - Tree[] kids = t.children(); - if (kids != null) { - for (Tree kid : kids) { - sb.append(' '); - toStringBuilder(sb, kid); - } - } - return sb.append(')'); - } - } - - public static void main(String[] args){ - VerbNetProcessor p = VerbNetProcessor. - getInstance("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources"); - - Matcher matcher = new Matcher(); - TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree(); - - ParseThicket pt = matcher.buildParseThicketFromTextWithRST(//"I went to the forest to look for a tree. I found out that it was thick and green"); - "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. "+ - "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. 
" + - "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + - "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "); - - List<String> results = extender.buildForestForCorefArcs(pt); - System.out.println(results); - //System.exit(0); - - List<Tree> forest = pt.getSentences(); - - List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"its"}); - System.out.println(trees); - StringBuilder sb = new StringBuilder(10000); - extender.toStringBuilderExtenderByAnotherLinkedTree1(sb, forest.get(0), trees.get(0), new String[]{"the", "forest"}); - System.out.println(sb.toString()); - - - // - //extender.toStringBuilderExtenderByAnotherLinkedTree(sb, forest.get(0), forest.get(1)); - //System.out.println(sb.toString()); - } -}
