[47/51] [partial] opennlp-sandbox git commit: merge from bgalitsky's own git repo

bgalitsky Wed, 16 Nov 2016 01:11:40 -0800

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifier.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifier.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifier.java
new file mode 100644
index 0000000..d1c80ad
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifier.java
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.kernel_interface;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.FileUtils;
+
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+
+public class TreeKernelBasedClassifier {
+       // Class-wide logger. NOTE(review): the logger name refers to package
+       // "opennlp.tools.similarity.apps", not this class's own package - confirm this is intended.
+       protected static Logger LOG = Logger
+                       
.getLogger("opennlp.tools.similarity.apps.TreeKernelBasedClassifier");
+       // Work queues of files: queuePos is filled by addFilesPos, queueNeg by addFilesNeg.
+       protected ArrayList<File> queuePos = new ArrayList<File>(), queueNeg = 
new ArrayList<File>();
+  
+       // Collaborators: parse thicket builder, runner of the tree kernel learner/classifier,
+       // and the builder of extended trees that are dumped into the sample files.
+       protected Matcher matcher = new Matcher();
+       protected TreeKernelRunner tkRunner = new TreeKernelRunner();
+       protected TreeExtenderByAnotherLinkedTree treeExtender = new 
TreeExtenderByAnotherLinkedTree();
+
+
+       // Prefix of all working files. It is concatenated directly with the file names below
+       // (e.g. path+modelFileName), so it has to end with a path separator.
+       protected String path;
+       // Sets the prefix under which the model, training, input and output files are located.
+       public void setKernelPath (String path){
+               this.path=path;
+       }
+       // File (relative to path) holding the model; passed to the learner and to the classifier.
+       protected static final String modelFileName = "model.txt";
+
+       // File (relative to path) with labeled samples passed to the learner.
+       protected static final String trainingFileName = "training.txt";
+
+       // File (relative to path) with the samples which are to be classified.
+       protected static final String unknownToBeClassified = "unknown.txt";
+
+       // File (relative to path) from which classification scores are read back.
+       protected static final String classifierOutput = 
"classifier_output.txt";
+       // A score has to be strictly greater than this threshold to be reported as "in" (true).
+       protected static final Float MIN_SVM_SCORE_TOBE_IN = 0.2f;
+       
+       /**
+        * Main entry point of the SVM tree kernel classifier for a single document.
+        * Extracts the longer paragraphs of the file, builds the tree kernel samples for them,
+        * dumps the samples into the 'unknown' file, runs the classifier and averages the
+        * scores read back from the classifier output.
+        *
+        * @param f document to be classified
+        * @return true if the averaged score is strictly above MIN_SVM_SCORE_TOBE_IN,
+        *         false if it is not (or if no score was produced at all),
+        *         null if classification had to be skipped: the model file is absent or
+        *         the samples could not be written
+        */
+       public Boolean classifyText(File f){
+               File unknownFile = new File(path+unknownToBeClassified);
+               FileUtils.deleteQuietly(unknownFile);
+               if (!(new File(path+modelFileName).exists())){
+                       LOG.severe("Model file '" +modelFileName + "'is absent: skip SVM classification");
+                       return null;
+               }
+               List<String> texts=DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
+               List<String> treeBankBuffer = formTreeKernelStructuresMultiplePara(texts, "0");
+
+               // write the list of samples to a file
+               try {
+                       FileUtils.writeLines(unknownFile, null, treeBankBuffer);
+               } catch (IOException e) {
+                       LOG.severe("Problem creating parse thicket files '"+ path+unknownToBeClassified + "' to be classified\n"+ e.getMessage() );
+                       // without the input file the classifier output cannot belong to this document:
+                       // skip classification instead of reading possibly stale scores
+                       return null;
+               }
+
+               tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);
+               // read classification results, one score per sample
+               List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' ');
+
+               float accum = 0;
+               int scoredCount = 0;
+               StringBuilder scores = new StringBuilder("\nsvm scores per paragraph: ");
+               for(String[] line: classifResults){
+                       // tolerate blank lines of the classifier output
+                       if (line.length == 0 || line[0].trim().isEmpty())
+                               continue;
+                       float val = Float.parseFloat(line[0]);
+                       scores.append(val).append(' ');
+                       accum+=val;
+                       scoredCount++;
+               }
+               LOG.info(scores.toString());
+
+               if (scoredCount == 0){
+                       // nothing to average: avoid 0/0 and report the document as not 'in'
+                       LOG.info("\n no svm scores were produced");
+                       return false;
+               }
+               float averaged = accum/scoredCount;
+               LOG.info("\n average = "+averaged);
+               return averaged > MIN_SVM_SCORE_TOBE_IN;
+       }
+
+       /**
+        * Recursively collects all regular files under the given file or directory into queuePos.
+        * No filtering by file extension is applied. A path which does not exist, or a directory
+        * which cannot be listed, is reported on standard output and skipped.
+        *
+        * @param file a file, or a directory which is traversed recursively
+        */
+       protected void addFilesPos(File file) {
+               if (!file.exists()) {
+                       System.out.println(file + " does not exist.");
+                       // do not queue a path which is not there
+                       return;
+               }
+               if (!file.isDirectory()) {
+                       queuePos.add(file);
+                       return;
+               }
+               // listFiles() returns null on an I/O error or when access is denied
+               File[] children = file.listFiles();
+               if (children == null) {
+                       System.out.println(file + " cannot be listed.");
+                       return;
+               }
+               for (File f : children) {
+                       addFilesPos(f);
+                       System.out.println(f.getName());
+               }
+       }
+       
+       /**
+        * Recursively collects all regular files under the given file or directory into queueNeg.
+        * No filtering by file extension is applied. A path which does not exist, or a directory
+        * which cannot be listed, is reported on standard output and skipped.
+        *
+        * @param file a file, or a directory which is traversed recursively
+        */
+       protected void addFilesNeg(File file) {
+               if (!file.exists()) {
+                       System.out.println(file + " does not exist.");
+                       // do not queue a path which is not there
+                       return;
+               }
+               if (!file.isDirectory()) {
+                       queueNeg.add(file);
+                       return;
+               }
+               // listFiles() returns null on an I/O error or when access is denied
+               File[] children = file.listFiles();
+               if (children == null) {
+                       System.out.println(file + " cannot be listed.");
+                       return;
+               }
+               for (File f : children) {
+                       addFilesNeg(f);
+                       System.out.println(f.getName());
+               }
+       }
+
+       /**
+        * Builds the tree kernel model from two directories of documents.
+        * For each document one sample is formed from its first paragraph: label "1" for
+        * documents found under posDirectory and label "-1" for those under negDirectory.
+        * The samples are written to the training file and the learner is run on it.
+        *
+        * @param posDirectory file or directory with positive examples
+        * @param negDirectory file or directory with negative examples
+        */
+       protected void trainClassifier(
+                       String posDirectory, String negDirectory) {
+               queuePos.clear();
+               queueNeg.clear();
+               addFilesPos(new File(posDirectory));
+               addFilesNeg(new File(negDirectory));
+
+               // work on snapshots of the queues
+               List<File> positives = new ArrayList<File>(queuePos);
+               List<File> negatives = new ArrayList<File>(queueNeg);
+
+               List<String[]> samples = new ArrayList<String[]>();
+               for (File posFile : positives) {
+                       String paragraph = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(posFile);
+                       String sample = formTreeKernelStructure(paragraph, "1");
+                       samples.add(new String[]{sample});
+               }
+               for (File negFile : negatives) {
+                       String paragraph = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(negFile);
+                       String sample = formTreeKernelStructure(paragraph, "-1");
+                       samples.add(new String[]{sample});
+               }
+
+               // dump the samples, one per line, and build the model from them
+               ProfileReaderWriter.writeReport(samples, path+trainingFileName, ' ');
+               tkRunner.runLearner(path, trainingFileName, modelFileName);
+       }
+
+       /**
+        * Classifies every file found under the given directory by its first paragraph.
+        * One sample per file (label "0") is written to the 'unknown' file, the classifier is
+        * run on it, and the i-th line of the classifier output is paired with the i-th file.
+        *
+        * @param dirFilesToBeClassified file or directory with documents to be classified
+        * @return one row per classifier output line:
+        *         {file name, "true"/"false" (score strictly above MIN_SVM_SCORE_TOBE_IN),
+        *         score as read from the output, absolute path of the file}
+        */
+       public List<String[]> classifyFilesInDirectory(String dirFilesToBeClassified){
+               // queuePos is reused as the queue of files with unknown class
+               queuePos.clear();
+               addFilesPos(new File( dirFilesToBeClassified));
+               List<File> unknownFiles = new ArrayList<File>(queuePos);
+
+               List<String[]> samples = new ArrayList<String[]>();
+               for (File unknownFile : unknownFiles) {
+                       String paragraph = DescriptiveParagraphFromDocExtractor.getFirstParagraphFromFile(unknownFile);
+                       samples.add(new String[]{formTreeKernelStructure(paragraph, "0")});
+               }
+
+               // form a file from the texts to be classified and classify it
+               ProfileReaderWriter.writeReport(samples, path+unknownToBeClassified, ' ');
+               tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);
+               List<String[]> scoreLines = ProfileReaderWriter.readProfiles(path+classifierOutput, ' ');
+
+               // pair each score with the file at the same position
+               List<String[]> report = new ArrayList<String[]>();
+               int index = 0;
+               for (String[] scoreLine : scoreLines) {
+                       float score = Float.parseFloat(scoreLine[0]);
+                       String verdict = Boolean.toString(score > MIN_SVM_SCORE_TOBE_IN);
+                       File scored = unknownFiles.get(index);
+                       report.add(new String[]{scored.getName(), verdict, scoreLine[0], scored.getAbsolutePath()});
+                       index++;
+               }
+               return report;
+       }
+
+       /**
+        * Forms tree kernel samples for a list of texts: a parse thicket is built for each text,
+        * and every tree of its extended forest becomes one sample line
+        * flag + " |BT| " + tree + " |ET| ".
+        * If anything fails, the problem is logged and the samples collected so far are
+        * returned; the remaining texts are not processed.
+        *
+        * @param texts texts (paragraphs) to be converted
+        * @param flag  label which starts every sample line
+        * @return sample lines in the order of the texts
+        */
+       protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
+               List<String> allSamples = new ArrayList<String>();
+               try {
+                       for(String paragraph: texts){
+                               LOG.info("About to build pt from "+paragraph);
+                               ParseThicket thicket = matcher.buildParseThicketFromTextWithRST(paragraph);
+                               LOG.info("About to build extended forest ");
+                               for(String tree: treeExtender.buildForestForCorefArcs(thicket)){
+                                       allSamples.add(flag + " |BT| "+tree + " |ET| ");
+                               }
+                               LOG.info("DONE");
+                       }
+               } catch (Exception e) {
+                       LOG.severe("Problem forming  parse thicket flat file to be classified\n"+ e.getMessage() );
+               }
+               return allSamples;
+       }
+       /**
+        * Forms a single tree kernel sample for a text:
+        * flag, then " |BT| " + tree for every balanced tree of the extended forest of the
+        * parse thicket of the text, then " |ET|".
+        * The sample always starts with the flag and always contains at least one (possibly
+        * empty) " |BT| " section, also when parsing fails or no balanced tree is available,
+        * so that every returned line has the same layout.
+        *
+        * @param text text to be converted
+        * @param flag label of the sample
+        * @return the sample line
+        */
+       protected String formTreeKernelStructure(String text, String flag) {
+               // the label goes first, no matter what happens later
+               StringBuilder sample = new StringBuilder().append(flag);
+               boolean hasTree = false;
+               try {
+                       // get the parses from original documents, and form the training dataset
+                       LOG.info("About to build pt from "+text);
+                       ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);
+                       LOG.info("About to build extended forest ");
+                       List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
+                       LOG.info("DONE");
+
+                       for(String t: extendedTreesDump ){
+                               if (BracesProcessor.isBalanced(t)){
+                                       sample.append(" |BT| ").append(t);
+                                       hasTree = true;
+                               } else {
+                                       LOG.warning("Wrong tree: " + t);
+                               }
+                       }
+               } catch (Exception e) {
+                       LOG.log(java.util.logging.Level.SEVERE, "Problem forming tree kernel structure", e);
+               }
+               // empty forest, only unbalanced trees, or a failure before the first tree
+               if (!hasTree)
+                       sample.append(" |BT| ");
+               return sample.append(" |ET|").toString();
+       }
+
+       /**
+        * Trains the classifier and classifies a directory of documents.
+        * Arguments: posDirectory negDirectory dirToClassify [verbNetResourceDir] [kernelPath].
+        * The two optional arguments default to the locations which were hard-coded before.
+        * The report is written to svmDesignDocReport03minus.csv.
+        */
+       public static void main(String[] args){
+               if (args == null || args.length < 3) {
+                       System.err.println("Usage: TreeKernelBasedClassifier <posDirectory> <negDirectory> <dirToClassify> [verbNetResourceDir] [kernelPath]");
+                       return;
+               }
+               String verbNetDir = args.length > 3 ? args[3]
+                               : "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources";
+               String kernelPath = args.length > 4 ? args[4]
+                               : "/Users/borisgalitsky/Documents/tree_kernel/";
+
+               // the returned processor is not used here; presumably the call initializes the
+               // shared VerbNet instance - TODO confirm
+               VerbNetProcessor.getInstance(verbNetDir);
+
+               TreeKernelBasedClassifier proc = new TreeKernelBasedClassifier();
+               proc.setKernelPath(kernelPath);
+               proc.trainClassifier(args[0], args[1]);
+               List<String[]>res = proc.classifyFilesInDirectory(args[2]);
+               ProfileReaderWriter.writeReport(res, "svmDesignDocReport03minus.csv");
+       }
+
+}


http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierMultiplePara.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierMultiplePara.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierMultiplePara.java
new file mode 100644
index 0000000..45fb98c
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierMultiplePara.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.kernel_interface;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.apache.commons.io.FileUtils;
+
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor;
+import opennlp.tools.parse_thicket.matching.Matcher;
+
+public class TreeKernelBasedClassifierMultiplePara extends 
TreeKernelBasedClassifier{
+       /** When true, trainClassifier stops reading files of a class once its file counter exceeds 3000. */
+       boolean bShortRun = false;
+
+       /** Switches the short run mode on; there is no way to switch it off again. */
+       public void setShortRun(){
+               this.bShortRun = true;
+       }
+
+
+       /**
+        * Builds the tree kernel model from two directories of documents, using all longer
+        * paragraphs of each document (not only the first one). Samples of documents under
+        * posDirectory are labeled "1", samples of documents under negDirectory "-1".
+        * If the training file cannot be written, the problem is logged and the learner is
+        * not run, so that a model is never built from a stale or partial training file.
+        *
+        * @param posDirectory file or directory with positive examples
+        * @param negDirectory file or directory with negative examples
+        */
+       @Override
+       public void trainClassifier(
+                       String posDirectory, String negDirectory) {
+
+               queuePos.clear(); queueNeg.clear();
+               addFilesPos(new File(posDirectory));
+               addFilesNeg(new File(negDirectory));
+
+               List<File> filesPos = new ArrayList<File>(queuePos), filesNeg = new ArrayList<File>(queueNeg);
+
+               Collection<String> treeBankBuffer = new ArrayList<String>();
+               collectSamples(filesPos, "1", treeBankBuffer);
+               collectSamples(filesNeg, "-1", treeBankBuffer);
+
+               // write the list of samples to a file
+               try {
+                       FileUtils.writeLines(new File(path+trainingFileName), null, treeBankBuffer);
+               } catch (IOException e) {
+                       LOG.log(java.util.logging.Level.SEVERE,
+                                       "Problem writing training file '" + path+trainingFileName + "': the model is not rebuilt", e);
+                       return;
+               }
+               // build the model
+               tkRunner.runLearner(path, trainingFileName, modelFileName);
+       }
+
+       // Appends to sink the samples of all longer paragraphs of the given files, labeled with flag.
+       private void collectSamples(List<File> files, String flag, Collection<String> sink) {
+               int processed = 0;
+               for (File f : files) {
+                       // get the longer paragraphs of the text
+                       List<String> texts=DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
+                       sink.addAll(formTreeKernelStructuresMultiplePara(texts, flag));
+                       // short run: the check follows the processing and precedes the increment,
+                       // so at most 3002 files per class are read (kept as it was)
+                       if (bShortRun && processed>3000)
+                               break;
+                       processed++;
+               }
+       }
+
+       /**
+        * Classifies every file found under the given directory by all of its longer paragraphs.
+        * Each paragraph tree becomes one sample (label "0"); the classifier scores of the
+        * samples of a file are averaged, and the file is 'in' when the average is strictly
+        * above MIN_SVM_SCORE_TOBE_IN.
+        * Exactly one row per file is returned, in the order of the files:
+        * {file name, "true"/"false" or "unknown", averaged score or "0", absolute path,
+        * number of classifier output lines consumed so far, index of the file}.
+        * "unknown" is reported for a file which gave no samples or for which the classifier
+        * output holds no scores. If the samples cannot be written, the problem is logged and
+        * an empty list is returned.
+        *
+        * @param dirFilesToBeClassified file or directory with documents to be classified
+        * @return classification rows, one per file
+        */
+       @Override
+       public List<String[]> classifyFilesInDirectory(String dirFilesToBeClassified){
+               List<String[]>results = new ArrayList<String[]>();
+
+               // queuePos is reused as the queue of files with unknown class
+               queuePos.clear();
+               addFilesPos(new File( dirFilesToBeClassified));
+               List<File> filesUnkn = new ArrayList<File>(queuePos);
+
+               // samples of all files in one buffer; itemsPerFile remembers how many belong to each file
+               List<String> treeBankBuffer = new ArrayList<String>();
+               int[] itemsPerFile = new int[filesUnkn.size()];
+               for (int i = 0; i < filesUnkn.size(); i++) {
+                       List<String> texts=DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(filesUnkn.get(i));
+                       List<String> lines = formTreeKernelStructuresMultiplePara(texts, "0");
+                       itemsPerFile[i] = lines.size();
+                       treeBankBuffer.addAll(lines);
+               }
+               // write the list of samples to a file
+               try {
+                       FileUtils.writeLines(new File(path+unknownToBeClassified), null, treeBankBuffer);
+               } catch (IOException e) {
+                       LOG.log(java.util.logging.Level.SEVERE,
+                                       "Problem writing file '" + path+unknownToBeClassified + "' to be classified", e);
+                       // scores read now could not belong to these files
+                       return results;
+               }
+
+               tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);
+               // read classification results: the i-th line scores the i-th sample
+               List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' ');
+
+               int itemCount = 0;
+               for (int objectCount = 0; objectCount < filesUnkn.size(); objectCount++) {
+                       File file = filesUnkn.get(objectCount);
+                       // average the scores of the samples which belong to this file only
+                       float accum = 0;
+                       int scored = 0;
+                       for (int k = 0; k < itemsPerFile[objectCount] && itemCount < classifResults.size(); k++) {
+                               accum += Float.parseFloat(classifResults.get(itemCount)[0]);
+                               itemCount++;
+                               scored++;
+                       }
+                       if (scored == 0) {
+                               results.add(new String[]{file.getName(), "unknown", "0",
+                                               file.getAbsolutePath(), Integer.toString(itemCount), Integer.toString(objectCount)});
+                               continue;
+                       }
+                       float averaged = accum/scored;
+                       boolean in = averaged > MIN_SVM_SCORE_TOBE_IN;
+                       results.add(new String[]{file.getName(), Boolean.toString(in), Float.toString(averaged),
+                                       file.getAbsolutePath(), Integer.toString(itemCount), Integer.toString(objectCount)});
+               }
+               return results;
+       }
+
+
+       /**
+        * Forms tree kernel samples for a list of texts: a parse thicket is built for each text,
+        * and every tree of its extended forest becomes one sample line
+        * flag + " |BT| " + tree + " |ET| ". Progress is printed to standard output.
+        * If anything fails, the stack trace is printed and the samples collected so far are
+        * returned; the remaining texts are not processed.
+        *
+        * @param texts texts (paragraphs) to be converted
+        * @param flag  label which starts every sample line
+        * @return sample lines in the order of the texts
+        */
+       protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
+               List<String> collected = new ArrayList<String>();
+               try {
+                       for(String paragraph: texts){
+                               System.out.println("About to build pt from "+paragraph);
+                               ParseThicket thicket = matcher.buildParseThicketFromTextWithRST(paragraph);
+                               System.out.print("About to build extended forest ");
+                               for(String tree: treeExtender.buildForestForCorefArcs(thicket)){
+                                       collected.add(flag + " |BT| "+tree + " |ET| ");
+                               }
+                               System.out.println("DONE");
+                       }
+               } catch (Exception e) {
+                       e.printStackTrace();
+               }
+               return collected;
+       }
+
+       /**
+        * Trains the multi-paragraph classifier on the 'ted' (positive) versus 'Tedi' (negative)
+        * text sets at fixed local paths. Command line arguments are not used; classification
+        * of a directory and the report (svmDesignDocReport05plus.csv) are currently disabled.
+        */
+       public static void main(String[] args){
+               // the returned processor is not used here
+               VerbNetProcessor.getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources");
+
+               String positiveDir = "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/ted";
+               String negativeDir = "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/Tedi";
+
+               TreeKernelBasedClassifierMultiplePara classifier = new TreeKernelBasedClassifierMultiplePara();
+               classifier.setKernelPath("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
+               classifier.trainClassifier(positiveDir, negativeDir);
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierOfDiscourseTree.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierOfDiscourseTree.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierOfDiscourseTree.java
new file mode 100644
index 0000000..71e8245
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelBasedClassifierOfDiscourseTree.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.kernel_interface;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.parse_thicket.external_rst.MatcherExternalRST;
+import opennlp.tools.parse_thicket.external_rst.ParseThicketWithDiscourseTree;
+
+/*
+ * This class performs TK learning based on a parse thicket which includes RST relations only,
+ * obtained from the RST parser of Surdeanu et al. Sentence parsing and the NLP pipeline are
+ * those of Surdeanu's wrapper of Stanford NLP.
+ */
+public class TreeKernelBasedClassifierOfDiscourseTree extends 
TreeKernelBasedClassifierMultiplePara{
+
+       // Matcher which formTreeKernelStructuresMultiplePara of this class uses to build parse thickets.
+       private MatcherExternalRST matcherRST = new MatcherExternalRST();
+
+       /**
+        * Forms tree kernel samples for a list of texts using the external RST matcher.
+        * For each text a parse thicket is built; a text for which no thicket is returned is
+        * skipped. Every tree of the representation of the thicket (RST arcs option) becomes one
+        * sample line flag + " |BT| " + tree + " |ET| ". Progress is printed to standard output.
+        * A failure on one text prints the stack trace and processing continues with the next text.
+        *
+        * @param texts texts (paragraphs) to be converted
+        * @param flag  label which starts every sample line
+        * @return sample lines in the order of the texts
+        */
+       protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
+               List<String> collected = new ArrayList<String>();
+               try {
+                       for(String paragraph: texts){
+                               try {
+                                       System.out.print("About to build pt with external rst from "+paragraph + "\n...");
+                                       ParseThicket thicket = matcherRST.buildParseThicketFromTextWithRST(paragraph);
+                                       if (thicket != null) {
+                                               System.out.print("About to build extended forest with external rst...");
+                                               ParseThicketWithDiscourseTree discourseThicket = (ParseThicketWithDiscourseTree)thicket;
+                                               // false: extended trees from RST arcs rather than the direct DT dumps
+                                               List<String> trees = buildReptresentationForDiscourseTreeAndExtensions(discourseThicket, false);
+                                               for(String tree: trees){
+                                                       collected.add(flag + " |BT| "+tree + " |ET| ");
+                                               }
+                                               System.out.println("DONE");
+                                       }
+                               } catch (Exception e) {
+                                       e.printStackTrace();
+                               }
+                       }
+               } catch (Exception e) {
+                       e.printStackTrace();
+               }
+               return collected;
+       }
+
+       /**
+        * Builds the list of tree dumps for a parse thicket with a discourse tree.
+        *
+        * @param pt        parse thicket with discourse tree
+        * @param bDirectDT false: return the forest built by the tree extender for RST arcs;
+        *                  true: return the four dumps of the discourse tree itself
+        *                  (plain, with POS, with embedded trees, with VerbNet), in this order
+        * @return list of tree dumps
+        */
+       private List<String> buildReptresentationForDiscourseTreeAndExtensions(ParseThicketWithDiscourseTree pt, boolean bDirectDT){
+               if (!bDirectDT) {
+                       // option 1: use RST relation for extended trees
+                       return treeExtender.buildForestForRSTArcs(pt);
+               }
+               // option 2: use DT directly
+               List<String> dumps = new ArrayList<String>();
+               dumps.add(pt.getDtDump());
+               dumps.add(pt.getDtDumpWithPOS());
+               dumps.add(pt.getDtDumpWithEmbeddedTrees());
+               dumps.add(pt.getDtDumpWithVerbNet());
+               return dumps;
+       }
+       
+       /*
+        * dtDump
+        * 1 |BT| (elaboration (joint (attribution (I though) (I d tell you a 
little about what I like to write )) (joint (And I like to immerse myself in my 
topics ) (joint (I just like to dive right i) (and become sort of a human 
guinea pig )))) (elaboration (joint (And I see my life as a series of 
experiments ) (joint (So , I work for Esquire magazine ) (elaboration 
(elaboration (and a couple of years ago I wrote an articl) (called My 
Outsourced Life )) (enablement (where I hired a team of people in Bangalore , 
India ) (to live my life for me ))))) (elaboration (So they answered my emails 
) (They answered my phone )))) |ET|
+        * 
+        * getDtDumpWithPOS
+        * 
+        *  1 |BT| (elaboration (joint (attribution (I PRP)(thought VBD) (I 
PRP)(d NN)(tell VBP)(you PRP)(a DT)(little JJ)(about IN)(what WP)(I PRP)(like 
VBP)(to TO)(write VB)) (joint (And CC)(I PRP)(like VBP)(to TO)(immerse 
VB)(myself PRP)(in IN)(my PRP$)(topics NNS) (joint (I PRP)(just RB)(like 
VBP)(to TO)(dive NN)(right NN)(in IN) (and CC)(become VB)(sort NN)(of IN)(a 
DT)(human JJ)(guinea NN)(pig NN)))) (elaboration (joint (And CC)(I PRP)(see 
VBP)(my PRP$)(life NN)(as IN)(a DT)(series NN)(of IN)(experiments NNS) (joint 
(So RB)(I PRP)(work VBP)(for IN)(Esquire NNP)(magazine NN) (elaboration 
(elaboration (and CC)(a DT)(couple NN)(of IN)(years NNS)(ago IN)(I PRP)(wrote 
VBD)(an DT)(article NN) (called VBN)(My PRP$)(Outsourced JJ)(Life NNP)) 
(enablement (where WRB)(I PRP)(hired VBD)(a DT)(team NN)(of IN)(people NNS)(in 
IN)(Bangalore NNP)(India NNP) (to TO)(live VB)(my PRP$)(life NN)(for IN)(me 
PRP))))) (elaboration (So IN)(they PRP)(answered VBD)(my PRP$)(emails NNS) 
(They PRP)(answered VBD)
 (my PRP$)(phone NN)))) |ET| 
+        * 
+        * getDtDumpWithEmbeddedTrees()
+        * 1 |BT| (elaboration (joint (attribution (SBAR (S (NP (PRP I)) (VP 
(ADVP (NN d)) (VBP tell) (NP (PRP you)) (PP (NP (DT a) (JJ little)) (IN about) 
(SBAR (WHNP (WP what)) (S (NP (PRP I)) (VP (VBP like) (S (VP (TO to) (VP (VB 
write))))))))))) (VBP tell)) (joint (VP (VBP like) (S (VP (TO to) (VP (VB 
immerse) (NP (PRP myself)) (PP (IN in) (NP (PRP$ my) (NNS topics))))))) (joint 
(VP (VP (VBP like) (PP (TO to) (NP (NN dive) (NN right))) (PP (IN in))) (CC 
and) (VP (VB become) (NP (NP (NN sort)) (PP (IN of) (NP (DT a) (JJ human) (NN 
guinea) (NN pig)))))) (NP (NP (NN sort)) (PP (IN of) (NP (DT a) (JJ human) (NN 
guinea) (NN pig))))))) (elaboration (joint (VP (VBP see) (NP (PRP$ my) (NN 
life)) (PP (IN as) (NP (NP (DT a) (NN series)) (PP (IN of) (NP (NNS 
experiments)))))) (joint (S (NP (PRP I)) (VP (VBP work) (PP (IN for) (NP (NNP 
Esquire) (NN magazine))))) (elaboration (elaboration (NN couple) (JJ 
Outsourced)) (enablement (VP (VBP work) (PP (IN for) (NP (NNP Esquire) (NN 
magazine)))) (NP (PR
 P$ my) (NN life)))))) (elaboration (VP (VBD answered) (NP (PRP$ my) (NNS 
emails))) (NP (PRP$ my) (NN phone))))) |ET|
+        
+        pt.getDtDumpWithVerbNet()
+        1 |BT| (elaboration (joint (attribution (I PRP)(thought VBD) (I PRP)(d 
NN) (tell  (tell-372 tell-372 tell-372 ) (NP V NP NP V NP PP-topic NP V NP S ) 
(NP NP-PPof-PP NP-S ) ) (you PRP)(a DT)(little JJ)(about IN)(what WP)(I 
PRP)(like VBP)(to TO)(write VB)) (joint (And CC)(I PRP)(like VBP)(to 
TO)(immerse VB)(myself PRP)(in IN)(my PRP$)(topics NNS) (joint (I PRP)(just 
RB)(like VBP)(to TO)(dive NN)(right NN)(in IN) (and CC)(become VB)(sort NN)(of 
IN)(a DT)(human JJ)(guinea NN)(pig NN)))) (elaboration (joint (And CC)(I PRP) 
(see  (see-301 see-301 see-301 ) (NP V NP NP V that S NP V NP-ATTR-POS 
PP-oblique NP V how S NP V what S ) (Basic Transitive S Attribute Object 
Possessor-Attribute Factoring Alternation HOW-S WHAT-S ) ) (my PRP$)(life 
NN)(as IN)(a DT)(series NN)(of IN)(experiments NNS) (joint (So RB)(I PRP)(work 
VBP)(for IN)(Esquire NNP)(magazine NN) (elaboration (elaboration (and CC)(a 
DT)(couple NN)(of IN)(years NNS)(ago IN)(I PRP)(wrote VBD)(an DT)(article NN) 
(call  (dub-293 dub-
 293 dub-293 ) (NP V NP NP NP V NP ) (NP-NP Basic Transitive ) ) (My 
PRP$)(Outsourced JJ)(Life NNP)) (enablement (where WRB)(I PRP) (hire  
(hire-1353 hire-1353 hire-1353 ) (NP V NP NP V NP PP-predicate ) (NP NP-PPas-PP 
) ) (a DT)(team NN)(of IN)(people NNS)(in IN)(Bangalore NNP)(India NNP) (to 
TO)(live VB)(my PRP$)(life NN)(for IN)(me PRP))))) (elaboration (So IN)(they 
PRP)(answered VBD)(my PRP$)(emails NNS) (They PRP)(answered VBD)(my PRP$)(phone 
NN)))) |ET|
+        *
+        */
+       
+       
+       
+       /**
+        * Trains the discourse tree based classifier on two corpora at fixed local paths.
+        * Command line arguments are not used.
+        *
+        * Corpora tried before (kept for reference):
+        *   /Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/ted
+        *   /Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/Tedi
+        *   /Users/bgalitsky/Documents/ENRON/enron_random
+        *   /Users/bgalitsky/Documents/ENRON/data11_17
+        *   /Users/bgalitsky/Documents/ENRON/enron_secrecy
+        *   /Users/bgalitsky/Downloads/op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor
+        *   /Users/borisgalitsky/Documents/workspace/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/ted
+        *   /Users/borisgalitsky/Documents/workspace/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/Tedi
+        */
+       public static void main(String[] args){
+               // the returned processor is not used here
+               VerbNetProcessor.getInstance("/Users/borisgalitsky/Documents/workspace/relevance-based-on-parse-trees/src/test/resources");
+
+               TreeKernelBasedClassifierOfDiscourseTree classifier = new TreeKernelBasedClassifierOfDiscourseTree();
+               classifier.setKernelPath("/Users/borisgalitsky/Documents/workspace/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
+
+               // first argument of trainClassifier: positive examples, second: negative examples
+               String positiveDir = "/Users/bgalitsky/Downloads/op_spam_v1.4/negative_polarity/deceptive_from_MTurk";
+               String negativeDir = "/Users/bgalitsky/Downloads/op_spam_v1.4/negative_polarity/truthful_from_Web";
+               classifier.trainClassifier(positiveDir, negativeDir);
+       }
+
+}
+/*
+ * 
+RST - based run
+Number of examples: 6980, linear space size: 10
+ted vs Tedi
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 1931
+done. (3597 iterations)
+Optimization finished (78 misclassified, maxdiff=0.00100).
+Runtime in cpu-seconds: 198.37
+Number of SV: 3830 (including 652 at upper bound)
+L1 loss: loss=261.78883
+Norm of weight vector: |w|=41.37067
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=1712.53247
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.05
+XiAlpha-estimate of the error: error<=11.53% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>97.01% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>89.47% (rho=1.00,depth=0)
+Number of kernel evaluations: 73092240
+
+GENERAL RUN (the same set of texts)
+Number of examples: 21146, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 8849
+done. (5770 iterations)
+Optimization finished (231 misclassified, maxdiff=0.00098).
+Runtime in cpu-seconds: 1486.33
+Number of SV: 5368 (including 940 at upper bound)
+L1 loss: loss=582.99311
+Norm of weight vector: |w|=46.91885
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=2202.37876
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.13
+XiAlpha-estimate of the error: error<=5.57% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>98.42% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>95.18% (rho=1.00,depth=0)
+Number of kernel evaluations: 550748695
+Writing model file...done
+
+
+Number of examples: 7461, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 2091
+done. (3773 iterations)
+Optimization finished (87 misclassified, maxdiff=0.00096).
+Runtime in cpu-seconds: 231.42
+Number of SV: 4092 (including 680 at upper bound)
+L1 loss: loss=280.03696
+Norm of weight vector: |w|=42.82963
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=1835.37688
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.05
+XiAlpha-estimate of the error: error<=11.54% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>96.75% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>89.59% (rho=1.00,depth=0)
+Number of kernel evaluations: 94432306
+Writing model file...done
+
+
+
+SMALL SET
+
+Number of examples: 172, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing.......................................................done. (56 
iterations)
+Optimization finished (0 misclassified, maxdiff=0.00076).
+Runtime in cpu-seconds: 0.01
+Number of SV: 172 (including 59 at upper bound)
+L1 loss: loss=7.38525
+Norm of weight vector: |w|=12.46777
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=156.44537
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.00
+XiAlpha-estimate of the error: error<=44.77% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>79.55% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>54.26% (rho=1.00,depth=0)
+Number of kernel evaluations: 20139
+Writing model file...done
+
+
+LONGER RUN, DTs only
+Number of examples: 720, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing............................................................................................................................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 114
+done. (269 iterations)
+Optimization finished (11 misclassified, maxdiff=0.00096).
+Runtime in cpu-seconds: 0.17
+Number of SV: 712 (including 140 at upper bound)
+L1 loss: loss=117.83422
+Norm of weight vector: |w|=12.73402
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=163.15526
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.00
+XiAlpha-estimate of the error: error<=20.14% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>99.14% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>80.42% (rho=1.00,depth=0)
+Number of kernel evaluations: 283615
+Writing model file...done
+
+HYBRID RUN
+Number of examples: 8301, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 2323
+done. (4206 iterations)
+Optimization finished (98 misclassified, maxdiff=0.00099).
+Runtime in cpu-seconds: 299.94
+Number of SV: 4870 (including 846 at upper bound)
+L1 loss: loss=398.61389
+Norm of weight vector: |w|=44.95124
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=2021.61414
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.05
+XiAlpha-estimate of the error: error<=12.32% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>97.15% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>88.53% (rho=1.00,depth=0)
+Number of kernel evaluations: 138447398
+Writing model file...done
+
+HYBRID FULL RUN
+
+Number of examples: 2880, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0021
+Optimizing...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 1035
+done. (1820 iterations)
+Optimization finished (162 misclassified, maxdiff=0.00099).
+Runtime in cpu-seconds: 1.35
+Number of SV: 1552 (including 556 at upper bound)
+L1 loss: loss=426.90789
+Norm of weight vector: |w|=25.52139
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=652.34149
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.01
+XiAlpha-estimate of the error: error<=23.92% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>92.67% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>80.55% (rho=1.00,depth=0)
+Number of kernel evaluations: 4075095
+Writing model file...done
+
+
+
+
+
+positive vs negative sentiment
+"/Users/bgalitsky/Downloads/op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor",
+                               
"/Users/bgalitsky/Downloads/op_spam_v1.4/negative_polarity/truthful_from_Web" 
+
+Number of examples: 15930, linear space size: 10
+
+estimating ...
+Setting default regularization parameter C=1.0000
+Optimizing...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
 
............................................................................................................................................................................
+ Checking optimality of inactive variables...done.
+ Number of inactive variables = 4348
+done. (11130 iterations)
+Optimization finished (14 misclassified, maxdiff=0.00098).
+Runtime in cpu-seconds: 2213.21
+Number of SV: 9219 (including 875 at upper bound)
+L1 loss: loss=126.05211
+Norm of weight vector: |w|=71.25103
+Norm of longest example vector: |x|=1.00000
+Estimated VCdim of classifier: VCdim<=5077.70889
+Computing XiAlpha-estimates...done
+Runtime for XiAlpha-estimates in cpu-seconds: 0.09
+XiAlpha-estimate of the error: error<=10.15% (rho=1.00,depth=0)
+XiAlpha-estimate of the recall: recall=>89.36% (rho=1.00,depth=0)
+XiAlpha-estimate of the precision: precision=>89.85% (rho=1.00,depth=0)
+Number of kernel evaluations: 837061668
+Writing model file...done
+ */
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/style_classif/TSNE_ImporterProcessor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/style_classif/TSNE_ImporterProcessor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/style_classif/TSNE_ImporterProcessor.java
new file mode 100644
index 0000000..ef00e94
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/style_classif/TSNE_ImporterProcessor.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.kernel_interface.style_classif;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import 
opennlp.tools.parse_thicket.kernel_interface.TreeKernelBasedClassifierMultiplePara;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Splits the t-SNE import file into one text file per {@code <text ...>} entry
+ * and then hands two of the resulting directories to the tree kernel classifier.
+ *
+ * Each entry is written to {@code txt/<first 4 chars of id>/<index><id>.txt}
+ * under {@link #resourceWorkDir}.
+ */
+public class TSNE_ImporterProcessor {
+       /** Name of the import file, resolved against {@link #resourceWorkDir}. */
+       private static final String importFilePath = "all-tsne2.txt";
+
+       /**
+        * Working directory for the import file and the generated "txt" tree; ends with '/'.
+        * The empty abstract pathname resolves to the current user directory, so no
+        * "/." suffix has to be stripped (stripping every "/." would also mangle
+        * hidden directories such as "/home/u/.config").
+        */
+       public String resourceWorkDir = new File("").getAbsolutePath() +
+                       "/src/test/resources/style_recognizer/";
+
+       /**
+        * Empties the "txt" directory, reads the import file and writes the body of
+        * every fragment found between "&lt;text " and "/text&gt;" into its own file.
+        *
+        * I/O failures are reported on stderr; a failed read or an import file
+        * without any fragment ends the import without writing anything.
+        */
+       public void importFileCreatClassifDirs() {
+               File txtDir = new File(resourceWorkDir + "txt");
+               try {
+                       // cleanDirectory() rejects a directory that does not exist yet
+                       FileUtils.forceMkdir(txtDir);
+                       FileUtils.cleanDirectory(txtDir);
+               } catch (IOException e2) {
+                       e2.printStackTrace();
+               }
+
+               String text;
+               try {
+                       text = FileUtils.readFileToString(new File(resourceWorkDir + importFilePath),
+                                       Charset.defaultCharset().toString());
+               } catch (IOException e) {
+                       e.printStackTrace();
+                       // nothing to split; continuing would only fail on the missing text
+                       return;
+               }
+
+               // substringsBetween() returns null (not an empty array) when nothing matches
+               String[] portions = StringUtils.substringsBetween(text, "<text ", "/text>");
+               if (portions == null) {
+                       System.err.println("No <text ...> entries found in " + resourceWorkDir + importFilePath);
+                       return;
+               }
+
+               for (int i = 0; i < portions.length; i++) {
+                       String label = StringUtils.substringBetween(portions[i], "id=\"", "\">");
+                       String po = StringUtils.substringBetween(portions[i], "\">", "<");
+                       if (label == null) {
+                               System.err.println("Skipping entry " + i + ": no id attribute found");
+                               continue;
+                       }
+
+                       // the directory is named by the first four characters of the id
+                       // (fewer when the id itself is shorter)
+                       String localDirName = label.substring(0, Math.min(4, label.length()));
+                       File localDir = new File(resourceWorkDir + "txt/" + localDirName);
+                       try {
+                               // no-op when the directory already exists
+                               FileUtils.forceMkdir(localDir);
+                       } catch (IOException e1) {
+                               e1.printStackTrace();
+                       }
+                       try {
+                               // '/' inside an id would otherwise be read as a path separator
+                               String fileLabel = label.replace('/', '_');
+                               String fullPath = resourceWorkDir + "txt/" + localDirName + "/" + i + fileLabel + ".txt";
+                               FileUtils.writeStringToFile(new File(fullPath), po);
+                       } catch (IOException e) {
+                               e.printStackTrace();
+                       }
+               }
+
+       }
+
+       /**
+        * Runs the import and then trains the classifier on the "Tele" and "Tels" directories.
+        *
+        * @param args optional overrides: args[0] is the resource directory passed to
+        *             VerbNetProcessor, args[1] the tree kernel directory; the original
+        *             developer paths are used when they are absent
+        */
+       public static void main(String[] args){
+               String verbNetResourceDir = (args != null && args.length > 0) ? args[0]
+                               : "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources";
+               String kernelDir = (args != null && args.length > 1) ? args[1]
+                               : "/Users/borisgalitsky/Documents/tree_kernel/";
+
+               TSNE_ImporterProcessor thisProc = new TSNE_ImporterProcessor();
+               thisProc.importFileCreatClassifDirs();
+
+               // the returned instance is not needed here
+               // NOTE(review): presumably this call initializes the shared VerbNet instance - confirm
+               VerbNetProcessor.getInstance(verbNetResourceDir);
+
+               TreeKernelBasedClassifierMultiplePara proc = new TreeKernelBasedClassifierMultiplePara();
+               proc.setKernelPath(kernelDir);
+               proc.trainClassifier(thisProc.resourceWorkDir+"/txt/Tele",
+                               thisProc.resourceWorkDir+"/txt/Tels");
+               //www.sciencedirect.com/science/article/pii/S095070511300138X
+       }
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/FrameQueryBasedIExtractor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/FrameQueryBasedIExtractor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/FrameQueryBasedIExtractor.java
new file mode 100644
index 0000000..f75c0b1
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/FrameQueryBasedIExtractor.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+/**
+ * Builds templates by generalizing sample texts with {@link Matcher} and then
+ * reports which templates "fire" on (i.e. cover) new text.
+ */
+public class FrameQueryBasedIExtractor {
+       /** Templates accumulated by the build* methods and consulted by the doIE* methods. */
+       List<GeneralizationResult> templates = new ArrayList<GeneralizationResult>();
+       Matcher matcher = Matcher.getInstance();
+
+       /** Drops all templates built so far. */
+       private void init() {
+               templates.clear();
+       }
+
+       /**
+        * Folds all samples into a single template: the first two samples are
+        * generalized, then the result is generalized with each further sample.
+        *
+        * @param smpls sample texts; fewer than two samples yield no template
+        */
+       public void buildPTTemplates(String[] smpls){
+               // same rule as buildTemplates(): a template needs at least two samples
+               if (smpls == null || smpls.length < 2) {
+                       return;
+               }
+
+               GeneralizationResult templateCurr = matcher.assessRelevanceG(smpls[0], smpls[1]);
+               for (int i = 2; i < smpls.length; i++) {
+                       templateCurr = matcher.assessRelevanceG(templateCurr, smpls[i]);
+               }
+
+               templates.add(templateCurr);
+               System.out.println("template = "+ templateCurr);
+       }
+
+       /**
+        * Builds one template per element of {@code samples}: the element is split
+        * into sentences and the sentences are folded into a single generalization.
+        * Elements with fewer than two sentences are skipped.
+        *
+        * @param samples texts, each holding the sentences of one sample set
+        */
+       public void buildTemplates(String[] samples){
+               for (String setOfSamples : samples) {
+                       List<String> smpls = TextProcessor.splitToSentences(setOfSamples);
+                       if (smpls.size() < 2) {
+                               continue;
+                       }
+
+                       GeneralizationResult templateCurr = matcher.assessRelevanceG(smpls.get(0), smpls.get(1));
+                       for (int i = 2; i < smpls.size(); i++) {
+                               templateCurr = matcher.assessRelevanceG(templateCurr, smpls.get(i));
+                       }
+
+                       templates.add(templateCurr);
+                       System.out.println("template = "+ templateCurr+ "\n");
+               }
+       }
+
+       /**
+        * Builds one template for every unordered pair of sentences within each
+        * element of {@code samples}.
+        *
+        * @param samples texts, each holding the sentences of one sample set
+        */
+       public void buildTemplatesPairWise(String[] samples){
+               for (String setOfSamples : samples) {
+                       List<String> smpls = TextProcessor.splitToSentences(setOfSamples);
+
+                       for (int i = 0; i < smpls.size(); i++) {
+                               for (int j = i + 1; j < smpls.size(); j++) {
+                                       GeneralizationResult templateCurr = matcher.assessRelevanceG(smpls.get(i), smpls.get(j));
+                                       templates.add(templateCurr);
+                                       System.out.println("template = "+ templateCurr+ "\n");
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Matches every sentence of {@code text} against every template.
+        *
+        * @param text text to split into sentences
+        * @return one result per (sentence, template) pair whose generalization is
+        *         covered by the template; each carries the sentence and the fire flag
+        */
+       List<GeneralizationResult>  doIE(String text){
+               List<GeneralizationResult> fires = new ArrayList<GeneralizationResult>();
+
+               List<String> sentences = TextProcessor.splitToSentences(text);
+               for (String sent : sentences) {
+                       for (GeneralizationResult t : templates) {
+                               GeneralizationResult res = matcher.assessRelevanceG(t.getGen(), sent);
+                               boolean fire = matcher.isCoveredByTemplate(t.getGen(), res.getGen());
+                               System.out.println(res+ " => "+ fire + "\n");
+                               if (fire) {
+                                       res.setIfFire(fire);
+                                       res.setText(sent);
+                                       fires.add(res);
+                                       System.out.println("=====================\n TEMPLATE FIRED: "+ sent + "\n====================\n");
+                               }
+                       }
+               }
+               return fires;
+       }
+
+       /**
+        * Matches {@code text} as a whole (no sentence splitting) against every template.
+        *
+        * @param text text to match
+        * @return the results whose generalization is covered by their template
+        */
+       List<GeneralizationResult>  doIEforPT(String text){
+               List<GeneralizationResult> fires = new ArrayList<GeneralizationResult>();
+
+               for (GeneralizationResult t : templates) {
+                       GeneralizationResult res = matcher.assessRelevanceG(t.getGen(), text);
+                       boolean fire = matcher.isCoveredByTemplate(t.getGen(), res.getGen());
+                       System.out.println(res+ " =PT=> "+ fire + "\n");
+                       res.setIfFire(fire);
+                       res.setText(text);
+                       if (fire) {
+                               fires.add(res);
+                       }
+               }
+               return fires;
+       }
+
+
+       /** Demo driver; the VerbNet resource path is specific to the original developer's machine. */
+       public static void main(String[] args){
+               VerbNetProcessor.getInstance("/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources");
+               FrameQueryBasedIExtractor extractor = new FrameQueryBasedIExtractor();
+
+               String[] texts = new String[]{"An amusement park sells adult tickets for $3 and kids tickets for $2, and got the revenue $500 yesterday.",
+                                               "A certified trainer conducts training for adult customers for $30 per hour and kid customer for $20 per hour, and got the revenue $1000 today."};
+               extractor.buildPTTemplates(texts);
+
+               texts = new String[]{"Crossing the snow slope was dangerous. They informed in the blog that an ice axe should be used. However, I am reporting that crossing the snow field in the late afternoon I had to use crampons.",
+                               "I could not cross the snow creek since it was dangerous. This was because the previous hiker reported that ice axe should be used in late afternoon.  To inform the fellow hikers, I had to use crampons going across the show field in the late afternoon ",
+               };
+               extractor.buildPTTemplates(texts);
+               List<GeneralizationResult>  res = extractor.doIEforPT( "I had to use crampons to cross snow slopes without an ice axe in late afternoon. However in summer I do not feel it was dangerous crossing the snow.");
+
+               // The JVM exits here, so the sentence-level demos below never run;
+               // remove this call to enable them.
+               System.exit(0);
+
+               extractor.buildTemplates(new String[] { ""
+                               + "A junior sale engineer expert travels to customers on site. A junior design expert goes to customer companies. "
+                               + "A junior software engineer rushes to customer sites. "
+               });
+               res = extractor.doIE( "Cisco junior sale representative expert flew to customers data centers. ");
+
+               extractor.init();
+
+               extractor.buildTemplates(new String[] { "John Doe send his California driver license 1234567. "
+                               + "Jill Paparapathi received her Ohio license 4567456"   });
+
+               res = extractor.doIE( "Mary Jones send her Canada prisoner id number 666666666. Mary Stewart hid her Mexico cook id number 666666666 . Robin Hood mentioned his UK fisher id  2345."
+                               + "Yesterday Peter Doe hid his Bolivia set id number 666666666. Robin mentioned her best Peru fisher man id  2345. Spain hid her Catalonian driver id number 666666666. John Poppins hid her  prisoner id  666666666. "
+                               + "Microsoft announced its Canada windows azure release number 666666666. John Poppins hid her Apple id  666666666");
+
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationResult.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationResult.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationResult.java
new file mode 100644
index 0000000..4c8ae6a
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationResult.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.List;
+
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+/**
+ * Holder for a generalization (lists of parse tree chunks) together with an
+ * optional "fired" flag and the text it was computed for.
+ */
+public class GeneralizationResult {
+       /** The generalization itself; may be null if it was constructed or set that way. */
+       List<List<ParseTreeChunk>> gen;
+       /** Fire flag; stays null until {@link #setIfFire(boolean)} is called. */
+       Boolean bFire;
+       /** Associated text; stays null until {@link #setText(String)} is called. */
+       String text;
+
+       /**
+        * @param gen the generalization to wrap
+        */
+       public GeneralizationResult(List<List<ParseTreeChunk>> gen) {
+               this.gen = gen;
+       }
+
+       /** @return the wrapped generalization */
+       public List<List<ParseTreeChunk>> getGen() {
+               return gen;
+       }
+
+       /** @param gen the generalization to wrap */
+       public void setGen(List<List<ParseTreeChunk>> gen) {
+               this.gen = gen;
+       }
+
+       /**
+        * @return the string form of the wrapped generalization, or "null" when
+        *         there is none (instead of failing with a NullPointerException)
+        */
+       @Override
+       public String toString(){
+               return String.valueOf(gen);
+       }
+
+       /** @param fire the value to store as the fire flag */
+       public void setIfFire(boolean fire) {
+               this.bFire = fire;
+       }
+
+       /** @param text2 the text to associate with this result */
+       public void setText(String text2) {
+               this.text = text2;
+       }
+
+       /** @return the fire flag, or null if it was never set */
+       public Boolean getbFire() {
+               return bFire;
+       }
+
+       /** @return the associated text, or null if it was never set */
+       public String getText() {
+               return text;
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
new file mode 100644
index 0000000..de966e5
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaGeneralizer.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.word2vec.W2VDistanceMeasurer;
+
+/**
+ * Generalizes two lemmas: yields the first lemma when the two are considered
+ * the same word, otherwise their word2vec similarity encoded as a string.
+ */
+public class LemmaGeneralizer implements IGeneralizer<String> {
+       /** Prefix marking a result that carries a word2vec similarity instead of a lemma. */
+       public static final String w2vPrefix = "w2v_";
+       PStemmer ps = new PStemmer();
+       /** Set via {@link #setPOS(String)}; not consulted by {@link #generalize(Object, Object)}. */
+       String pos = null;
+       W2VDistanceMeasurer w2v = null;
+
+       public LemmaGeneralizer() {
+               w2v = W2VDistanceMeasurer.getInstance();
+       }
+
+       public void setPOS(String posToSet){
+               this.pos = posToSet;
+       }
+
+       /**
+        * Compares two lemmas case-insensitively.
+        *
+        * @param o1 first lemma, expected to be a String
+        * @param o2 second lemma, expected to be a String
+        * @return a list holding the lower-cased first lemma when the lemmas are
+        *         equal, differ only by a trailing "s", one is a prefix or suffix of
+        *         the other, or they share a stem; otherwise a list holding
+        *         {@link #w2vPrefix} followed by the similarity value when a word2vec
+        *         model is available; an empty list when either argument is null,
+        *         stemming fails, or no model is available
+        */
+       @Override
+       public List<String> generalize(Object o1, Object o2) {
+               List<String> results = new ArrayList<String>();
+               if (o1 == null || o2 == null) {
+                       // nothing to compare
+                       return results;
+               }
+
+               // Locale.ROOT keeps lower-casing independent of the JVM default locale
+               String lemma1 = ((String) o1).toLowerCase(java.util.Locale.ROOT);
+               String lemma2 = ((String) o2).toLowerCase(java.util.Locale.ROOT);
+
+               if (lemma1.equals(lemma2)) {
+                       results.add(lemma1);
+                       return results;
+               }
+
+               // plural "s", or one lemma being a prefix/suffix of the other
+               if ((lemma1.equals(lemma2 + "s") || lemma2.equals(lemma1 + "s"))
+                               || lemma1.endsWith(lemma2) || lemma2.endsWith(lemma1)
+                               || lemma1.startsWith(lemma2) || lemma2.startsWith(lemma1)) {
+                       results.add(lemma1);
+                       return results;
+               }
+
+               try {
+                       if (ps != null) {
+                               if (ps.stem(lemma1).toString()
+                                               .equalsIgnoreCase(ps.stem(lemma2).toString())) {
+                                       results.add(lemma1);
+                                       return results;
+                               }
+                       }
+               } catch (Exception e) {
+                       System.err.println("Problem processing " + lemma1 + " " + lemma2);
+                       return results;
+               }
+
+               // if different words, then compute word2vec distance and write the value as a string
+               if (w2v != null && w2v.vec != null) {
+                       double value = w2v.vec.similarity(lemma1, lemma2);
+                       // narrowed to float before formatting, as new Float(double) did
+                       results.add(w2vPrefix + Float.toString((float) value));
+               }
+               return results;
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/MyMatcher.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/MyMatcher.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/MyMatcher.java
new file mode 100644
index 0000000..97ab041
--- /dev/null
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/MyMatcher.java
@@ -0,0 +1,126 @@
+package opennlp.tools.parse_thicket.matching;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.log4j.Logger;
+import org.deeplearning4j.berkeley.Pair;
+
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.LogManager;
+
+import edu.stanford.nlp.util.StringUtils;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.*;
+
+/**
+ * Interactive console tool: reads answer classes from a file, then repeatedly
+ * reads a query from stdin and prints the linguistic score of the query
+ * against the shortlisted classes.
+ *
+ * Created by sanviswa on 10/29/16.
+ */
+public class MyMatcher {
+
+    /** Default answer-class file; specific to the original developer's machine. */
+    private static final String DEFAULT_CLASSES_FILE =
+            "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/fidelity.txt";
+
+    Matcher m = new Matcher();
+
+
+    /**
+     * @param args optional: args[0] overrides the default answer-class file
+     */
+    public static void main(String[] args) throws Exception
+    {
+        MyMatcher myMatcher = new MyMatcher();
+        List<String> classes = (args != null && args.length > 0)
+                ? myMatcher.readLines(args[0])
+                : myMatcher.readFile();
+        myMatcher.runTest(classes);
+    }
+
+    /**
+     * Reads the default answer-class file.
+     *
+     * @return the lines of the file as a fixed-size list
+     * @throws Exception if the file cannot be read
+     */
+    public List<String> readFile() throws Exception
+    {
+        return readLines(DEFAULT_CLASSES_FILE);
+    }
+
+    /** Reads the file at {@code path} and splits it into lines; accepts LF and CRLF endings. */
+    private List<String> readLines(String path) throws Exception
+    {
+        String content = FileUtils.readFileToString(new File(path));
+        // "\\r?\\n" so that CRLF files do not leave a trailing '\r' on every class
+        return Arrays.asList(content.split("\\r?\\n"));
+    }
+
+    /**
+     * Prompts for queries until the user enters "quit" or stdin is exhausted,
+     * scoring each query against {@code lst}.
+     *
+     * @param lst the answer classes
+     * @throws Exception if scoring a query fails
+     */
+    public void runTest(List<String> lst) throws Exception
+    {
+        // One Scanner for the whole session: a Scanner reads ahead, so creating a
+        // new one per query can lose input that an earlier one already buffered.
+        // It is deliberately not closed, since that would close System.in.
+        Scanner scanner = new Scanner(System.in);
+        while (true)
+        {
+            System.out.println("Enter text: ");
+            if (!scanner.hasNextLine())
+            {
+                return;
+            }
+            String queryStr = scanner.nextLine();
+            if ("quit".equals(queryStr))
+            {
+                return;
+            }
+            checkLinguisticScores(queryStr, lst);
+        }
+    }
+
+    /**
+     * Shortlists the answer classes sharing significant tokens with the query,
+     * scores the query against each of them and prints the classes sorted by
+     * ascending score.
+     *
+     * Classes sharing more than one significant token are shortlisted first; if
+     * that gives fewer than five, classes sharing exactly one are added; if
+     * nothing overlaps at all, every class is scored.
+     *
+     * @param q     the query
+     * @param aList the answer classes
+     * @throws Exception if matching fails
+     */
+    public void checkLinguisticScores(String q, List<String> aList) throws Exception
+    {   // convert query into list of tokens
+        List<String> queryTokens = TextProcessor.fastTokenize(q.toLowerCase(), false);
+
+        List<String> shortListedClasses = new ArrayList<String>();
+        // classes with exactly one shared token, kept aside for the lower threshold
+        List<String> singleTokenClasses = new ArrayList<String>();
+        for (String ans: aList) {
+            // convert answer class into the list of tokens
+            List<String> classTokens = TextProcessor.fastTokenize(ans.toLowerCase(), false);
+            // do intersection of tokens
+            classTokens.retainAll(queryTokens);
+            int tokenScore = 0;
+            // count significant tokens / no stopwords
+            for (String word: classTokens) {
+                if (word.length()>2 && StringUtils.isAlpha(word))
+                    tokenScore++;
+            }
+            if (tokenScore > 1)
+                shortListedClasses.add(ans);
+            else if (tokenScore == 1)
+                singleTokenClasses.add(ans);
+        }
+        // lower the threshold if too few results; only the classes not yet
+        // shortlisted are added, so no class is scored and printed twice
+        if (shortListedClasses.size() < 5)
+            shortListedClasses.addAll(singleTokenClasses);
+        // if no overlap give up and do the full list
+        if (shortListedClasses.isEmpty())
+            shortListedClasses = aList;
+
+        ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+
+        ArrayList<Pair<String,Double>> pairList = new ArrayList<Pair<String,Double>>();
+
+        for (String ans: shortListedClasses) {
+            List<List<ParseTreeChunk>> res = m.assessRelevanceCache(q, ans);
+            double score1 = parseTreeChunkListScorer.getParseTreeChunkListScoreAggregPhraseType(res);
+            pairList.add(new Pair<String, Double>(ans, score1));
+        }
+
+        // ascending: the highest score is printed last
+        Collections.sort(pairList, Comparator.comparing(p -> p.getSecond()));
+
+        System.out.println("***** '" + q + "' ******* falls into the following categories: ");
+        for (Pair<String, Double> score: pairList) {
+            System.out.println("        " + score.getFirst() + ": " + score.getSecond());
+        }
+
+
+    }
+}
\ No newline at end of file

[47/51] [partial] opennlp-sandbox git commit: merge from bgalitsky's own git repo

Reply via email to