http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java new file mode 100644 index 0000000..ef1e4b5 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.stemmer; + + + import java.io.IOException; + import java.io.InputStream; + import java.io.FileInputStream; + + import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; + import org.apache.lucene.util.ArrayUtil; + + /** + * + * Stemmer, implementing the Porter Stemming Algorithm + * + * The Stemmer class transforms a word into its root form. The input + * word can be provided a character at time (by calling add()), or at once + * by calling one of the various stem(something) methods. 
/**
 * Stemmer implementing the Porter Stemming Algorithm.
 *
 * <p>Transforms an English word into its root form (e.g. "caresses" ->
 * "caress", "running" -> "run"). The input word can be provided one
 * character at a time (by calling {@link #add(char)} followed by
 * {@link #stem()}), or all at once via one of the {@code stem(...)}
 * overloads.</p>
 *
 * <p>Not thread-safe: each instance holds mutable stemming state.</p>
 */
public class PStemmer {

    private static final int INITIAL_SIZE = 50;

    /** Working buffer holding the word being stemmed. */
    private char[] b;
    /** i = length of the word in b; j, k, k0 = algorithm cursors into b. */
    private int i, j, k, k0;
    /** True once the buffered word differs from the word supplied. */
    private boolean dirty = false;

    public PStemmer() {
        b = new char[INITIAL_SIZE];
        i = 0;
    }

    /**
     * reset() resets the stemmer so it can stem another word. If you invoke
     * the stemmer by calling add(char) and then stem(), you must call reset()
     * before starting another word.
     */
    public void reset() {
        i = 0;
        dirty = false;
    }

    /**
     * Add a character to the word being stemmed. When you are finished
     * adding characters, you can call stem(void) to process the word.
     */
    public void add(char ch) {
        if (b.length <= i) {
            // Grow geometrically for amortized O(1) appends. This replaces
            // the former dependency on Lucene's ArrayUtil.grow (and the
            // RamUsageEstimator.NUM_BYTES_CHAR constant, which was removed
            // in later Lucene versions), making the class self-contained.
            char[] bigger = new char[Math.max(b.length * 2, i + 1)];
            System.arraycopy(b, 0, bigger, 0, i);
            b = bigger;
        }
        b[i++] = ch;
    }

    /**
     * After a word has been stemmed, it can be retrieved by toString(),
     * or a reference to the internal buffer can be retrieved by
     * getResultBuffer and getResultLength (which is generally more
     * efficient.)
     */
    @Override
    public String toString() {
        return new String(b, 0, i);
    }

    /**
     * Returns the length of the word resulting from the stemming process.
     */
    public int getResultLength() {
        return i;
    }

    /**
     * Returns a reference to a character buffer containing the results of
     * the stemming process. You also need to consult getResultLength()
     * to determine the length of the result.
     */
    public char[] getResultBuffer() {
        return b;
    }

    /** cons(i) is true <=> b[i] is a consonant. */
    private boolean cons(int i) {
        switch (b[i]) {
            case 'a': case 'e': case 'i': case 'o': case 'u':
                return false;
            case 'y':
                // 'y' at the start of the stem counts as a consonant;
                // elsewhere it is a consonant only when preceded by a vowel.
                return (i == k0) ? true : !cons(i - 1);
            default:
                return true;
        }
    }

    /* m() measures the number of consonant sequences between k0 and j. If c
       is a consonant sequence and v a vowel sequence, and <..> indicates
       arbitrary presence,

          <c><v>       gives 0
          <c>vc<v>     gives 1
          <c>vcvc<v>   gives 2
          <c>vcvcvc<v> gives 3
          ....
    */
    private int m() {
        int n = 0;
        int i = k0;
        // Skip the optional leading consonant sequence.
        while (true) {
            if (i > j)
                return n;
            if (!cons(i))
                break;
            i++;
        }
        i++;
        // Count each vc pair.
        while (true) {
            while (true) {
                if (i > j)
                    return n;
                if (cons(i))
                    break;
                i++;
            }
            i++;
            n++;
            while (true) {
                if (i > j)
                    return n;
                if (!cons(i))
                    break;
                i++;
            }
            i++;
        }
    }

    /** vowelinstem() is true <=> k0,...j contains a vowel. */
    private boolean vowelinstem() {
        for (int i = k0; i <= j; i++) {
            if (!cons(i))
                return true;
        }
        return false;
    }

    /** doublec(j) is true <=> j,(j-1) contain a double consonant. */
    private boolean doublec(int j) {
        if (j < k0 + 1)
            return false;
        if (b[j] != b[j - 1])
            return false;
        return cons(j);
    }

    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
       and also if the second c is not w, x or y. This is used when trying to
       restore an e at the end of a short word, e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.
    */
    private boolean cvc(int i) {
        if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
            return false;
        int ch = b[i];
        if (ch == 'w' || ch == 'x' || ch == 'y')
            return false;
        return true;
    }

    /** ends(s) is true <=> k0,...k ends with s; on success sets j = k - s.length(). */
    private boolean ends(String s) {
        int l = s.length();
        int o = k - l + 1;
        if (o < k0)
            return false;
        for (int i = 0; i < l; i++) {
            if (b[o + i] != s.charAt(i))
                return false;
        }
        j = k - l;
        return true;
    }

    /* setto(s) sets (j+1),...k to the characters in the string s,
       readjusting k. */
    void setto(String s) {
        int l = s.length();
        int o = j + 1;
        for (int i = 0; i < l; i++) {
            b[o + i] = s.charAt(i);
        }
        k = j + l;
        dirty = true;
    }

    /** r(s) replaces the matched suffix with s when the stem measure is positive. */
    void r(String s) {
        if (m() > 0)
            setto(s);
    }

    /* step1() gets rid of plurals and -ed or -ing. e.g.

          caresses -> caress    ponies   -> poni     ties    -> ti
          caress   -> caress    cats     -> cat
          feed     -> feed      agreed   -> agree    disabled -> disable
          matting  -> mat       mating   -> mate     meeting  -> meet
          milling  -> mill      messing  -> mess     meetings -> meet
    */
    private void step1() {
        if (b[k] == 's') {
            if (ends("sses"))
                k -= 2;
            else if (ends("ies"))
                setto("i");
            else if (b[k - 1] != 's')
                k--;
        }
        if (ends("eed")) {
            if (m() > 0)
                k--;
        } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
            k = j;
            if (ends("at"))
                setto("ate");
            else if (ends("bl"))
                setto("ble");
            else if (ends("iz"))
                setto("ize");
            else if (doublec(k)) {
                int ch = b[k--];
                // Keep the doubled letter for final l, s or z
                // (e.g. "milling" -> "mill", not "mil").
                if (ch == 'l' || ch == 's' || ch == 'z')
                    k++;
            } else if (m() == 1 && cvc(k)) {
                setto("e");
            }
        }
    }

    /** step2() turns terminal y to i when there is another vowel in the stem. */
    private void step2() {
        if (ends("y") && vowelinstem()) {
            b[k] = 'i';
            dirty = true;
        }
    }

    /* step3() maps double suffices to single ones. So -ization ( = -ize plus
       -ation) maps to -ize etc. Note that the string before the suffix must
       give m() > 0.
    */
    private void step3() {
        if (k == k0) return; /* For Bug 1 */
        switch (b[k - 1]) {
            case 'a':
                if (ends("ational")) { r("ate"); break; }
                if (ends("tional")) { r("tion"); break; }
                break;
            case 'c':
                if (ends("enci")) { r("ence"); break; }
                if (ends("anci")) { r("ance"); break; }
                break;
            case 'e':
                if (ends("izer")) { r("ize"); break; }
                break;
            case 'l':
                if (ends("bli")) { r("ble"); break; }
                if (ends("alli")) { r("al"); break; }
                if (ends("entli")) { r("ent"); break; }
                if (ends("eli")) { r("e"); break; }
                if (ends("ousli")) { r("ous"); break; }
                break;
            case 'o':
                if (ends("ization")) { r("ize"); break; }
                if (ends("ation")) { r("ate"); break; }
                if (ends("ator")) { r("ate"); break; }
                break;
            case 's':
                if (ends("alism")) { r("al"); break; }
                if (ends("iveness")) { r("ive"); break; }
                if (ends("fulness")) { r("ful"); break; }
                if (ends("ousness")) { r("ous"); break; }
                break;
            case 't':
                if (ends("aliti")) { r("al"); break; }
                if (ends("iviti")) { r("ive"); break; }
                if (ends("biliti")) { r("ble"); break; }
                break;
            case 'g':
                if (ends("logi")) { r("log"); break; }
        }
    }

    /** step4() deals with -ic-, -full, -ness etc. Similar strategy to step3. */
    private void step4() {
        switch (b[k]) {
            case 'e':
                if (ends("icate")) { r("ic"); break; }
                if (ends("ative")) { r(""); break; }
                if (ends("alize")) { r("al"); break; }
                break;
            case 'i':
                if (ends("iciti")) { r("ic"); break; }
                break;
            case 'l':
                if (ends("ical")) { r("ic"); break; }
                if (ends("ful")) { r(""); break; }
                break;
            case 's':
                if (ends("ness")) { r(""); break; }
                break;
        }
    }

    /** step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
    private void step5() {
        if (k == k0) return; /* for Bug 1 */
        switch (b[k - 1]) {
            case 'a':
                if (ends("al")) break;
                return;
            case 'c':
                if (ends("ance")) break;
                if (ends("ence")) break;
                return;
            case 'e':
                if (ends("er")) break;
                return;
            case 'i':
                if (ends("ic")) break;
                return;
            case 'l':
                if (ends("able")) break;
                if (ends("ible")) break;
                return;
            case 'n':
                if (ends("ant")) break;
                if (ends("ement")) break;
                if (ends("ment")) break;
                /* element etc. not stripped before the m */
                if (ends("ent")) break;
                return;
            case 'o':
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (ends("ou")) break;
                return;
                /* takes care of -ous */
            case 's':
                if (ends("ism")) break;
                return;
            case 't':
                if (ends("ate")) break;
                if (ends("iti")) break;
                return;
            case 'u':
                if (ends("ous")) break;
                return;
            case 'v':
                if (ends("ive")) break;
                return;
            case 'z':
                if (ends("ize")) break;
                return;
            default:
                return;
        }
        if (m() > 1)
            k = j;
    }

    /** step6() removes a final -e if m() > 1. */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1))
                k--;
        }
        if (b[k] == 'l' && doublec(k) && m() > 1)
            k--;
    }

    /**
     * Stem a word provided as a String. Returns the result as a String;
     * when stemming changed nothing, the original String is returned.
     */
    public String stem(String s) {
        if (stem(s.toCharArray(), s.length()))
            return toString();
        else
            return s;
    }

    /** Stem a word contained in a char[]. Returns true if the stemming process
     *  resulted in a word different from the input. You can retrieve the
     *  result with getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] word) {
        return stem(word, word.length);
    }

    /** Stem a word contained in a portion of a char[] array. Returns
     *  true if the stemming process resulted in a word different from
     *  the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] wordBuffer, int offset, int wordLen) {
        reset();
        if (b.length < wordLen) {
            b = new char[wordLen];
        }
        System.arraycopy(wordBuffer, offset, b, 0, wordLen);
        i = wordLen;
        return stem(0);
    }

    /** Stem a word contained in a leading portion of a char[] array.
     *  Returns true if the stemming process resulted in a word different
     *  from the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] word, int wordLen) {
        return stem(word, 0, wordLen);
    }

    /** Stem the word placed into the Stemmer buffer through calls to add().
     *  Returns true if the stemming process resulted in a word different
     *  from the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem() {
        return stem(0);
    }

    /**
     * Runs the stemming steps over the buffered word, with k0 = i0 as the
     * start of the stem. Returns true if the word was modified.
     */
    public boolean stem(int i0) {
        k = i - 1;
        k0 = i0;
        // Words of length <= 2 (relative to k0) are left untouched.
        if (k > k0 + 1) {
            step1(); step2(); step3(); step4(); step5(); step6();
        }
        // Also, a word is considered dirty if we lopped off letters.
        // Thanks to Ifigenia Vairelles for pointing this out.
        if (i != k + 1)
            dirty = true;
        i = k + 1;
        return dirty;
    }

    /** Test program for demonstrating the Stemmer. It reads a file and
     *  stems each word, writing the result to standard out.
     *  Usage: PStemmer file-name ...
     */
    public static void main(String[] args) {
        PStemmer s = new PStemmer();

        for (String fileName : args) {
            // try-with-resources guarantees the stream is closed even when
            // an IOException is thrown mid-read (the original leaked it).
            try (InputStream in = new FileInputStream(fileName)) {
                byte[] buffer = new byte[1024];
                int bufferLen, offset, ch;

                bufferLen = in.read(buffer);
                offset = 0;
                s.reset();

                while (true) {
                    if (offset < bufferLen)
                        ch = buffer[offset++];
                    else {
                        bufferLen = in.read(buffer);
                        offset = 0;
                        if (bufferLen < 0)
                            ch = -1;
                        else
                            ch = buffer[offset++];
                    }

                    // NOTE(review): the byte-to-char cast assumes single-byte
                    // text (ASCII/Latin-1); multi-byte UTF-8 letters are not
                    // decoded correctly here.
                    if (Character.isLetter((char) ch)) {
                        s.add(Character.toLowerCase((char) ch));
                    } else {
                        s.stem();
                        System.out.print(s.toString());
                        s.reset();
                        if (ch < 0)
                            break;
                        else
                            System.out.print((char) ch);
                    }
                }
            } catch (IOException e) {
                System.out.println("error reading " + fileName);
            }
        }
    }
}
package opennlp.tools.word2vec;

import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

import org.deeplearning4j.models.embeddings.WeightLookupTable;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.springframework.core.io.ClassPathResource;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Singleton wrapper around a pre-trained word2vec model, used to measure
 * semantic similarity between words.
 *
 * <p>The constructor attempts to load the GoogleNews binary model from
 * {@code src/test/resources/w2v/}; when loading fails, {@link #vec} is left
 * {@code null} and callers MUST check it before use.</p>
 */
public class W2VDistanceMeasurer {
	static W2VDistanceMeasurer instance;

	/** Loaded word2vec model, or null when the model could not be loaded. */
	public Word2Vec vec = null;
	private String resourceDir = null;

	/** Lazily creates the shared instance (model loading is expensive). */
	public synchronized static W2VDistanceMeasurer getInstance() {
		if (instance == null)
			instance = new W2VDistanceMeasurer();
		return instance;
	}

	/**
	 * Loads the GoogleNews word2vec model from the test-resources directory.
	 * On any failure, {@link #vec} stays null instead of throwing.
	 */
	public W2VDistanceMeasurer() {
		try {
			resourceDir = new File(".").getCanonicalPath() + "/src/test/resources";
		} catch (IOException e) {
			e.printStackTrace();
			vec = null;
			return;
		}

		String pathToW2V = resourceDir + "/w2v/GoogleNews-vectors-negative300.bin.gz";
		File gModel = new File(pathToW2V);
		try {
			vec = WordVectorSerializer.loadGoogleModel(gModel, true);
		} catch (IOException e) {
			System.out.println("Word2vec model is not loaded");
			vec = null;
		}
	}

	public static void main(String[] args) {
		W2VDistanceMeasurer vw2v = W2VDistanceMeasurer.getInstance();

		// FIX: the constructor deliberately leaves vec null when the model
		// file is absent; the original dereferenced it unconditionally and
		// crashed with a NullPointerException in that documented case.
		if (vw2v.vec == null) {
			System.out.println("Word2vec model unavailable; cannot compute similarity");
			return;
		}
		double value = vw2v.vec.similarity("product", "item");
		System.out.println(value);
	}

	/**
	 * Demo pipeline: trains a small word2vec model from raw_sentences.txt,
	 * writes the vectors to a text file and prints the nearest words to "day".
	 */
	public static void runCycle() {

		String filePath = null;
		try {
			filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
		} catch (IOException e1) {
			e1.printStackTrace();
		}

		System.out.println("Load & Vectorize Sentences....");
		// Strip white space before and after for each line
		SentenceIterator iter = null;
		try {
			iter = UimaSentenceIterator.createWithPath(filePath);
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		// Split on white spaces in the line to get words
		TokenizerFactory t = new DefaultTokenizerFactory();
		t.setTokenPreProcessor(new CommonPreprocessor());

		InMemoryLookupCache cache = new InMemoryLookupCache();
		WeightLookupTable table = new InMemoryLookupTable.Builder()
				.vectorLength(100)
				.useAdaGrad(false)
				.cache(cache)
				.lr(0.025f).build();

		System.out.println("Building model....");
		Word2Vec vec = new Word2Vec.Builder()
				.minWordFrequency(5).iterations(1)
				.layerSize(100).lookupTable(table)
				.stopWords(new ArrayList<String>())
				.vocabCache(cache).seed(42)
				.windowSize(5).iterate(iter).tokenizerFactory(t).build();

		System.out.println("Fitting Word2Vec model....");
		try {
			vec.fit();
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println("Writing word vectors to text file....");
		// Write word
		try {
			WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println("Closest Words:");
		Collection<String> lst = vec.wordsNearest("day", 10);
		System.out.println(lst);
	}
}
package opennlp.tools.fca;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedHashSet;

import junit.framework.TestCase;

/**
 * Test suite for the FCA (Formal Concept Analysis) lattice classes.
 *
 * <p>All test methods are currently disabled (commented out below); the
 * class only declares the shared {@code cl} fixture field. The disabled
 * tests exercise lattice construction from a .cxt context file, CSV/CXT
 * export, and random-noise generation over a binary context.</p>
 */
public class FCATest extends TestCase {

	// Shared lattice fixture used by the (disabled) tests; null until built.
	ConceptLattice cl = null;

	// NOTE(review): the original tests are kept here, disabled, exactly as
	// committed — re-enable once the fca resources/classes are stable.
/*
	public void testConceptLattice() {

		try {
			cl = new ConceptLattice("src/test/resources/fca/sports.cxt", true);
			cl.printLatticeStats();
			cl.printLatticeFull();
			cl.printBinContext();

			FcaWriter wt = new FcaWriter();
			wt.WriteStatsToCvs("stats.csv", cl, 0);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}

		FcaConverter converter = new FcaConverter();
		int[][] binCon = converter.latticeToContext(cl);

		if (binCon != null) {
			ConceptLattice new_cl = new ConceptLattice(binCon.length, binCon[0].length, binCon, false);
			new_cl.printLatticeStats();
			new_cl.printLatticeFull();
			new_cl.printBinContext();
			FcaWriter wt = new FcaWriter();
			wt.WriteStatsToCvs("stats.txt", cl, 0);
			//wt.WriteAsCxt("cl.cxt", cl);
			wt.WriteAsCxt("cl_new.cxt", new_cl);
		}
	}

	public void testRandom() {
		RandomNoiseGenerator rng = new RandomNoiseGenerator();
		try {
			cl = new ConceptLattice("src/test/resources/fca/sports.cxt", true);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		//int[][] bc = rng.AddObjectsAttributesWithProbability(10, 0.5, cl.binaryContext);
		int[][] bc = rng.AlterCellsWithProbability(0.2, cl.binaryContext);
		ConceptLattice new_cl = new ConceptLattice(bc.length, bc[0].length, bc, false);
		new_cl.printLatticeStats();
		new_cl.printLattice();
	}
*/
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.external_rst;

import java.util.List;

import junit.framework.TestCase;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.matching.Matcher;

/**
 * Verifies that importing an external RST (Rhetorical Structure Theory)
 * parse enriches a parse thicket with additional inter-sentence arcs.
 */
public class ExternalRSTImporterTest extends TestCase {

	public void testBuildParseThicketFromTextWithRSTtest() {
		Matcher matcher = new Matcher();
		// We combine our own RST rules with those of Joty 2014 to produce an
		// augmented parse thicket.
		String externalRSTresultFilename = "/external_rst/resInput.txt";

		// Multi-sentence complaint narrative used as the test document.
		ParseThicket thicket = matcher.buildParseThicketFromTextWithRST(
				"I explained that I made a deposit, and then wrote a check, which bounced due to a bank error. A customer service representative confirmed that it usually takes a day to process the deposit. "
						+ "I reminded that I was unfairly charged an overdraft fee amonth ago in a similar situation. "
						+ " They explained that the overdraft fee was due to insufficient funds as disclosed in my account information. I disagreed with their fee because I made a deposit well in "
						+ " advance and wanted this fee back. They denied responsibility saying that nothing an be done at this point. They also confirmed that I needed to look into the account rules closer.");

		ExternalRSTImporter importer = new ExternalRSTImporter();
		List<WordWordInterSentenceRelationArc> rstArcs =
				importer.buildPT2ptPhrases(thicket, externalRSTresultFilename);

		// A successful import yields a substantial number of RST arcs.
		assertTrue(rstArcs.size() > 10);
	}
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.collections.ListUtils; + +import junit.framework.TestCase; +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class JSMLearnerOnLatticeTest extends TestCase{ + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + LinguisticPatternStructure psPos = new LinguisticPatternStructure(0,0), psNeg = new LinguisticPatternStructure(0,0); + ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); + + public void testJSMLearner() { + + String text1p = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. "; + String text2p = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. 
"; + String text3p = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. "; + String text4p = "I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense."; + + + String text1n = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. "; + String text2n = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. "; + String text3n = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. "; + String text4n = "I showed my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income. 
"; + + List<List<ParseTreeChunk>> chunks1p = chunk_maker.formGroupedPhrasesFromChunksForPara(text1p); + List<List<ParseTreeChunk>> chunks2p = chunk_maker.formGroupedPhrasesFromChunksForPara(text2p); + List<List<ParseTreeChunk>> chunks3p = chunk_maker.formGroupedPhrasesFromChunksForPara(text3p); + List<List<ParseTreeChunk>> chunks4p = chunk_maker.formGroupedPhrasesFromChunksForPara(text4p); + List<List<ParseTreeChunk>> chunks1n = chunk_maker.formGroupedPhrasesFromChunksForPara(text1n); + List<List<ParseTreeChunk>> chunks2n = chunk_maker.formGroupedPhrasesFromChunksForPara(text2n); + List<List<ParseTreeChunk>> chunks3n = chunk_maker.formGroupedPhrasesFromChunksForPara(text3n); + List<List<ParseTreeChunk>> chunks4n = chunk_maker.formGroupedPhrasesFromChunksForPara(text4n); + + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + psPos.AddIntent(chunks1p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + psPos.AddIntent(chunks2p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + psPos.AddIntent(chunks3p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + psPos.AddIntent(chunks4p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(0); + psNeg.AddIntent(chunks1n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + psNeg.AddIntent(chunks2n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + psNeg.AddIntent(chunks3n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + psNeg.AddIntent(chunks4n, obj, 0); + + String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. 
I subtract my tax from my income"; + List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown); + List<List<List<ParseTreeChunk>>> posIntersections = new ArrayList<List<List<ParseTreeChunk>>>(), + negIntersections = new ArrayList<List<List<ParseTreeChunk>>>(); + List<List<ParseTreeChunk>> intersection = null; + for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ + if (psPos.conceptList.get(iConcept).intent!=null && psPos.conceptList.get(iConcept).intent.size()>0){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, chunksUnknown); + if (reduceList(intersection).size()>0) + posIntersections.add(reduceList(intersection)); + } + if (psNeg.conceptList.get(iConcept).intent!=null && psNeg.conceptList.get(iConcept).intent.size()>0){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, chunksUnknown); + if (reduceList(intersection).size()>0) + negIntersections.add(reduceList(intersection)); + } + } + + Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair = + removeInconsistenciesFromPosNegIntersections( posIntersections, + negIntersections); + + posIntersections = pair.getFirst(); + negIntersections = pair.getSecond(); + + List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = new ArrayList<List<List<ParseTreeChunk>>>(), + negIntersectionsUnderPos = new ArrayList<List<List<ParseTreeChunk>>>(); + + for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){ + for(int iConceptJ = 0; iConceptJ<negIntersections.size(); iConceptJ++){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, negIntersections.get(iConceptJ)); + if (reduceList(intersection).size()>0) + posIntersectionsUnderNeg.add(reduceList(intersection)); + } + } + + for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ + for(int iConceptJ = 
0; iConceptJ<posIntersections.size(); iConceptJ++){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, posIntersections.get(iConceptJ)); + if (reduceList(intersection).size()>0) + negIntersectionsUnderPos.add(reduceList(intersection)); + } + } + + List<ParseTreeChunk>posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg); + List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos); + + posIntersectionsUnderNegLst = subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst); + negIntersectionsUnderPosLst= subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst); + + System.out.println("Pos - neg inters = "+posIntersectionsUnderNegLst); + System.out.println("Neg - pos inters = "+negIntersectionsUnderPosLst); + + } + + public List<List<ParseTreeChunk>> reduceList(List<List<ParseTreeChunk>> list){ + float minScore = 1.3f; + List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); + + + ParseTreeChunkListScorer scorer = new ParseTreeChunkListScorer(); + for( List<ParseTreeChunk> group: list){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + if (scorer.getScore(ch) > minScore) + newGroup.add(ch); + } + if (newGroup.size()>0) + newList.add(newGroup); + } + + return newList; + + } + + public List<List<ParseTreeChunk>> flattenParseTreeChunkListList(List<List<List<ParseTreeChunk>>> listOfLists){ + List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); + + for( List<List<ParseTreeChunk>> member: listOfLists){ + Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); + for( List<ParseTreeChunk> group: member){ + if (group.size()>0) + newSet.addAll(group); + } + newList.add(new ArrayList<ParseTreeChunk>(newSet)); + } + + return newList; + } + + public List<ParseTreeChunk> flattenParseTreeChunkLst(List<List<List<ParseTreeChunk>>> listOfLists){ + 
List<ParseTreeChunk> newList = new ArrayList<ParseTreeChunk>(); + Set<ParseTreeChunk> newSetAll = new HashSet<ParseTreeChunk>(); + + + for( List<List<ParseTreeChunk>> member: listOfLists){ + Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); + for( List<ParseTreeChunk> group: member){ + if (group.size()>0) + newSet.addAll(group); + } + newSetAll.addAll(newSet); + } + + return removeDuplicates(new ArrayList<ParseTreeChunk>(newSetAll)); + } + + public List<ParseTreeChunk> removeDuplicates(List<ParseTreeChunk> dupes){ + List<Integer> toDelete = new ArrayList<Integer>(); + for(int i=0; i<dupes.size(); i++) + for(int j=i+1; j<dupes.size(); j++){ + if (dupes.get(i).equals(dupes.get(j))){ + toDelete.add(j); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<dupes.size(); i++){ + if (!toDelete.contains(i)) + cleaned.add(dupes.get(i)); + } + return cleaned; + } + + public List<ParseTreeChunk> subtract(List<ParseTreeChunk> main, List<ParseTreeChunk> toSubtract){ + List<Integer> toDelete = new ArrayList<Integer>(); + for(int i=0; i<main.size(); i++) + for(int j=0; j<toSubtract.size(); j++){ + if (main.get(i).equals(toSubtract.get(j))){ + toDelete.add(i); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<main.size(); i++){ + if (!toDelete.contains(i)) + cleaned.add(main.get(i)); + } + return cleaned; + } + public List<ParseTreeChunk> intesectParseTreeChunkLists(List<ParseTreeChunk> a, List<ParseTreeChunk> b){ + List<Integer> inters = new ArrayList<Integer>(); + for(int i=0; i<a.size(); i++) + for(int j=0; j<b.size(); j++){ + if (a.get(i).equals(b.get(j))){ + inters.add(i); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<a.size(); i++){ + if (inters.contains(i)) + cleaned.add(a.get(i)); + } + return cleaned; + } + + public Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> + 
removeInconsistenciesFromPosNegIntersections(List<List<List<ParseTreeChunk>>> pos, + List<List<List<ParseTreeChunk>>> neg ){ + + List<ParseTreeChunk> posIntersectionsFl = flattenParseTreeChunkLst(pos); + List<ParseTreeChunk> negIntersectionsFl = flattenParseTreeChunkLst(neg); + + List<ParseTreeChunk> intersParseTreeChunkLists = intesectParseTreeChunkLists(posIntersectionsFl, negIntersectionsFl); + + List<List<List<ParseTreeChunk>>> cleanedFromInconsPos = new ArrayList<List<List<ParseTreeChunk>>>(), + cleanedFromInconsNeg = new ArrayList<List<List<ParseTreeChunk>>>(); + /* + System.out.println("pos = "+ pos); + System.out.println("neg = "+ neg); + System.out.println("pos flat = "+ posIntersectionsFl); + System.out.println("neg flat = "+ negIntersectionsFl); + System.out.println("inters = "+ intersParseTreeChunkLists); + */ + + for( List<List<ParseTreeChunk>> member: pos){ + List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); + for( List<ParseTreeChunk> group: member){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + boolean bSkip = false; + for(ParseTreeChunk check: intersParseTreeChunkLists){ + if (check.equals(ch)) + bSkip=true; + } + if (!bSkip) + newGroup.add(ch); + } + if (newGroup.size()>0) + memberList.add(newGroup); + } + if (memberList.size()>0) + cleanedFromInconsPos.add(memberList); + } + + for( List<List<ParseTreeChunk>> member: neg){ + List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); + for( List<ParseTreeChunk> group: member){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + boolean bSkip = false; + for(ParseTreeChunk check: intersParseTreeChunkLists){ + if (check.equals(ch)) + bSkip=true; + } + if (!bSkip) + newGroup.add(ch); + } + if (newGroup.size()>0) + memberList.add(newGroup); + } + if (memberList.size()>0) + cleanedFromInconsNeg.add(memberList); + } + + return new 
Pair(cleanedFromInconsPos, cleanedFromInconsNeg); + + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java new file mode 100644 index 0000000..47324a2 --- /dev/null +++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; + +import junit.framework.TestCase; +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class PatternStructureTest extends TestCase{ + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + LinguisticPatternStructure ps = new LinguisticPatternStructure(0,0); + BingWebQueryRunner bqr = new BingWebQueryRunner(); + + public void test6texts() { + + String text1 = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income."; + String text2 = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue."; + String text3 = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent."; + String text4 = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income."; + String text5 = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it."; + String text6 = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. 
Remaining rental income needs to be added to my profit and be reported as taxable profit. "; + + List<List<ParseTreeChunk>> chunks1 = chunk_maker.formGroupedPhrasesFromChunksForPara(text1); + List<List<ParseTreeChunk>> chunks2 = chunk_maker.formGroupedPhrasesFromChunksForPara(text2); + List<List<ParseTreeChunk>> chunks3 = chunk_maker.formGroupedPhrasesFromChunksForPara(text3); + List<List<ParseTreeChunk>> chunks4 = chunk_maker.formGroupedPhrasesFromChunksForPara(text4); + List<List<ParseTreeChunk>> chunks5 = chunk_maker.formGroupedPhrasesFromChunksForPara(text5); + List<List<ParseTreeChunk>> chunks6 = chunk_maker.formGroupedPhrasesFromChunksForPara(text6); + //ArrayList<ParseTreeChunk> lst = new ArrayList<ParseTreeChunk>(); + + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + ps.AddIntent(chunks1, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + ps.AddIntent(chunks2, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + ps.AddIntent(chunks3, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + ps.AddIntent(chunks4, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(4); + ps.AddIntent(chunks5, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(5); + ps.AddIntent(chunks6, obj, 0); + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(3); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + assertEquals(new_cl.getLattice().size(), 7); + assertEquals(new_cl.getAttributesCount(), 21); + assertEquals(new_cl.getObjectCount(), 3); + } + + // TEST 2 QUERY NEWS + public void testQueryNews(){ + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bq = new BingWebQueryRunner(); + + String q = ""; +// q = "barack 
obama"; +// q = "lady gaga"; + q = "angela merkel"; +// q = "putin"; + ArrayList <HitBase> hb = (ArrayList<HitBase>) bq.runSearch(q, 10); + int cnt = 0; + for (HitBase news: hb){ + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(cnt); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(news.getAbstractText()); + System.out.println(chunks); + ps.AddIntent(chunks,obj, 0); + cnt++; + } + + ps.logStability(); + System.out.println("LATTICE"); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(cnt); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + + FcaWriter wr = new FcaWriter(); + wr.WriteAsCxt("res.cxt", new_cl); + + System.out.println("Extent PS "+ps.conceptList.size()); + //for (int i = 0; i<ps.conceptList.size();i++){ + // System.out.println(ps.conceptList.get(i).extent); + //} + System.out.println("Extent CL "+new_cl.getLattice().size()); + //for (int i = 0; i<new_cl.getLattice().size();i++){ + // System.out.println(new_cl.getLattice().get(i).getExtent()); + //} + } + public void testNews(){ + List<List<ParseTreeChunk>> chunks = null; + + ArrayList <HitBase> result = (ArrayList<HitBase>) bqr.runSearch("site:http://news.yahoo.com " + "merkel", 10); + System.out.println(" ResultSize " + result.size()); + int ind = -1; + String text_result = ""; + for (int i = 0; i < result.size(); i++ ){ + System.out.println(result.get(i).getAbstractText()); + ind = result.get(i).getAbstractText().indexOf(") -"); + if (ind < 0) + ind = result.get(i)//.getDescription() + .getAbstractText().indexOf(") �"); + if (ind > 0) + text_result = result.get(i)//.getDescription() + .getAbstractText().substring(ind + 3); + else + text_result = result.get(i)//.getDescription() + .getAbstractText(); + + 
LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(i); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(text_result); + ps.AddIntent(chunks,obj, 0); + } + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(result.size()); + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + + FcaWriter wt = new FcaWriter(); + wt.WriteStatsToTxt("merkel_stats.txt", new_cl, 0); + wt.WriteStatsToCvs("merkel_stats.csv", new_cl, ps.conceptList.size()); + wt.WriteAsCxt("merkel_lattice.cxt", new_cl); + + PatternStructureWriter pswt = new PatternStructureWriter(); + pswt.WriteStatsToTxt("ps_res.txt", ps); + + System.out.println("Extent PS "+ps.conceptList.size()); + System.out.println("Extent CL "+new_cl.getLattice().size()); + } + + + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java new file mode 100755 index 0000000..58246e1 --- /dev/null +++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; + +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class PhraseTest { + + + public static void main(String []args) { + +/*//TEST 1 + String text1 = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + + "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + + "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. 
"; + String text2 = "However, several experts familiar with the inspections believe that Iraq could also probably have produced a workable device in as little as 6 to 24 months, had they decided to seize foreign-supplied HEU from under safeguards and focus their efforts on a crash program to produce a device in the shortest possible amount of time."; + String text3 ="Iraq invested significant resources into uranium enrichment through laser isotope separation (LIS) involving both molecular (MLIS) and atomic vapor (AVLIS) technologies, including a number of activities with respect to laser component manufacture, particularly CO2 lasers and the manufacture of components for use in laser-related experimentation. The Laser Section within the Physics Department of the IAEC at Tuwaitha received an objective in 1981 from the IAEC to work in Laser Isotope Separation. It started in two lines; one which was looking after the molecular and the other the atomic vapor direction."; + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks1 = chunk_maker.formGroupedPhrasesFromChunksForPara(text1); + List<List<ParseTreeChunk>> chunks2 = chunk_maker.formGroupedPhrasesFromChunksForPara(text2); + List<List<ParseTreeChunk>> chunks3 = chunk_maker.formGroupedPhrasesFromChunksForPara(text3); + ArrayList<ParseTreeChunk> lst = new ArrayList<ParseTreeChunk>(); + PhrasePatternStructureExtended ps = new PhrasePatternStructureExtended(0,0); + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + ps.AddIntent(chunks1, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + ps.AddIntent(chunks2, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + ps.AddIntent(chunks3, obj, 0); + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(3); + for (int i = 0; i < binaryContext.length; i++ ){ + 
System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); +*/ + +/* // TEST 2 QUERY NEWS + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bq = new BingWebQueryRunner(); + + String q = ""; +// q = "barack obama"; +// q = "lady gaga"; + q = "angela merkel"; +// q = "putin"; + ArrayList <HitBase> hb = (ArrayList<HitBase>) bq.runSearch(q, 10); + PhrasePatternStructureExtended ps = new PhrasePatternStructureExtended(0,0); + int cnt = 0; + for (HitBase news: hb){ + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(cnt); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(news.getDescription()); + System.out.println(chunks); + ps.AddIntent(chunks,obj, 0); + cnt++; + } + + ps.logStability(); + System.out.println("LATTICE"); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(cnt); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + + FcaWriter wr = new FcaWriter(); + wr.WriteAsCxt("res.cxt", new_cl); + + System.out.println("Extent PS "+ps.conceptList.size()); + //for (int i = 0; i<ps.conceptList.size();i++){ + // System.out.println(ps.conceptList.get(i).extent); + //} + System.out.println("Extent CL "+new_cl.getLattice().size()); + //for (int i = 0; i<new_cl.getLattice().size();i++){ + // System.out.println(new_cl.getLattice().get(i).getExtent()); + //} +*/ + LinguisticPatternStructure ps = new LinguisticPatternStructure(0,0); + ParserChunker2MatcherProcessor chunk_maker = 
ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bqr = new BingWebQueryRunner(); + ArrayList <HitBase> result = (ArrayList<HitBase>) bqr.runSearch("site:http://news.yahoo.com " + "merkel", 10); + System.out.println(" ResultSize " + result.size()); + int ind = -1; + String text_result = ""; + for (int i = 0; i < result.size(); i++ ){ + System.out.println(result.get(i).getAbstractText()); + ind = result.get(i).getAbstractText().indexOf(") -"); + if (ind < 0) + ind = result.get(i)//.getDescription() + .getAbstractText().indexOf(") �"); + if (ind > 0) + text_result = result.get(i)//.getDescription() + .getAbstractText().substring(ind + 3); + else + text_result = result.get(i)//.getDescription() + .getAbstractText(); + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(i); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(text_result); + ps.AddIntent(chunks,obj, 0); + } + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(result.size()); + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + + FcaWriter wt = new FcaWriter(); + wt.WriteStatsToTxt("merkel_stats.txt", new_cl, 0); + wt.WriteStatsToCvs("merkel_stats.csv", new_cl, ps.conceptList.size()); + wt.WriteAsCxt("merkel_lattice.cxt", new_cl); + + PatternStructureWriter pswt = new PatternStructureWriter(); + pswt.WriteStatsToTxt("ps_res.txt", ps); + + System.out.println("Extent PS "+ps.conceptList.size()); + System.out.println("Extent CL "+new_cl.getLattice().size()); + + + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/external_rst/resInput.txt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/external_rst/resInput.txt 
b/opennlp-similarity/src/test/resources/external_rst/resInput.txt new file mode 100644 index 0000000..995b706 --- /dev/null +++ b/opennlp-similarity/src/test/resources/external_rst/resInput.txt @@ -0,0 +1,62 @@ +( Root (span 1 21) + ( Nucleus (span 1 5) (rel2par span) + ( Nucleus (span 1 3) (rel2par Joint) + ( Satellite (leaf 1) (rel2par Attribution) (text _!I explained_!) ) + ( Nucleus (span 2 3) (rel2par span) + ( Nucleus (leaf 2) (rel2par span) (text _!that_!) ) + ( Satellite (leaf 3) (rel2par Attribution) (text _!I made a deposit ,_!) ) + ) + ) + ( Nucleus (span 4 5) (rel2par Joint) + ( Nucleus (leaf 4) (rel2par span) (text _!and then wrote a check ,_!) ) + ( Satellite (leaf 5) (rel2par Elaboration) (text _!which bounced due to a bank error ._!) ) + ) + ) + ( Satellite (span 6 21) (rel2par Summary) + ( Nucleus (span 6 8) (rel2par span) + ( Satellite (leaf 6) (rel2par Attribution) (text _!A customer service representative confirmed_!) ) + ( Nucleus (span 7 8) (rel2par span) + ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) ) + ( Satellite (leaf 8) (rel2par Enablement) (text _!to process the deposit ._!) ) + ) + ) + ( Satellite (span 9 21) (rel2par Elaboration) + ( Nucleus (span 9 16) (rel2par span) + ( Nucleus (span 9 13) (rel2par span) + ( Nucleus (span 9 10) (rel2par span) + ( Satellite (leaf 9) (rel2par Attribution) (text _!I reminded_!) ) + ( Nucleus (leaf 10) (rel2par span) (text _!that I was unfairly charged an overdraft fee a month ago in a similar situation ._!) ) + ) + ( Satellite (span 11 13) (rel2par Elaboration) + ( Satellite (leaf 11) (rel2par Attribution) (text _!They explained_!) ) + ( Nucleus (span 12 13) (rel2par span) + ( Nucleus (leaf 12) (rel2par span) (text _!that the overdraft fee was due to insufficient funds_!) ) + ( Satellite (leaf 13) (rel2par Comparison) (text _!as disclosed in my account information ._!) 
) + ) + ) + ) + ( Satellite (span 14 16) (rel2par Elaboration) + ( Nucleus (leaf 14) (rel2par span) (text _!I disagreed with their fee_!) ) + ( Satellite (span 15 16) (rel2par Explanation) + ( Nucleus (leaf 15) (rel2par Joint) (text _!because I made a deposit well in advance_!) ) + ( Nucleus (leaf 16) (rel2par Joint) (text _!and wanted this fee back ._!) ) + ) + ) + ) + ( Satellite (span 17 21) (rel2par Topic-Comment) + ( Nucleus (span 17 19) (rel2par Joint) + ( Nucleus (leaf 17) (rel2par span) (text _!They denied responsibility_!) ) + ( Satellite (span 18 19) (rel2par Elaboration) + ( Satellite (leaf 18) (rel2par Attribution) (text _!saying_!) ) + ( Nucleus (leaf 19) (rel2par span) (text _!that nothing can be done at this point ._!) ) + ) + ) + ( Nucleus (span 20 21) (rel2par Joint) + ( Satellite (leaf 20) (rel2par Attribution) (text _!They also confirmed_!) ) + ( Nucleus (leaf 21) (rel2par span) (text _!that I needed to look into the account rules closer ._!) ) + ) + ) + ) + ) +) + http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/fca/sports.cxt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/fca/sports.cxt b/opennlp-similarity/src/test/resources/fca/sports.cxt new file mode 100755 index 0000000..038dc49 --- /dev/null +++ b/opennlp-similarity/src/test/resources/fca/sports.cxt @@ -0,0 +1,55 @@ +B + +20 +10 + +Obj 1 +Obj 2 +Obj 3 +Obj 4 +Obj 5 +Obj 6 +Obj 7 +Obj 8 +Obj 9 +Obj 10 +Obj 11 +Obj 12 +Obj 13 +Obj 14 +Obj 15 +Obj 16 +Obj 17 +Obj 18 +Obj 19 +Obj 20 +Attr 1 +Attr 2 +Attr 3 +Attr 4 +Attr 5 +Attr 6 +Attr 7 +Attr 8 +Attr 9 +Attr 10 +X...X....X +X...X....X +X...X..XX. +X.X.X..X.X +X..X.XX.X. +X..X.XX.X. +X..X.XX.X. +X..X.XX.X. +.X.X..X.X. +.X.X....X. +.X..X....X +.X.X....X. +.X..X....X +.X..X..X.X +.X..X....X +..XX...XX. +..X.X...X. +..XX.XX.X. +..X.X...X. 
+..XX.....X http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/new_vn.zip ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/new_vn.zip b/opennlp-similarity/src/test/resources/new_vn.zip new file mode 100644 index 0000000..cf0b9bc Binary files /dev/null and b/opennlp-similarity/src/test/resources/new_vn.zip differ
