http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java new file mode 100644 index 0000000..ef1e4b5 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PStemmer.java @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.stemmer; + + + import java.io.IOException; + import java.io.InputStream; + import java.io.FileInputStream; + + import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; + import org.apache.lucene.util.ArrayUtil; + + /** + * + * Stemmer, implementing the Porter Stemming Algorithm + * + * The Stemmer class transforms a word into its root form. The input + * word can be provided a character at time (by calling add()), or at once + * by calling one of the various stem(something) methods. 
/**
 * Stemmer implementing the Porter Stemming Algorithm.
 *
 * <p>Transforms an English word into its root form (e.g. "caresses" ->
 * "caress", "running" -> "run"). The input word can be provided one
 * character at a time (by calling {@link #add(char)} followed by
 * {@link #stem()}), or all at once via one of the {@code stem(...)}
 * overloads.</p>
 *
 * <p>Not thread-safe: each instance holds mutable stemming state.</p>
 */
public class PStemmer {

    private static final int INITIAL_SIZE = 50;

    /** Working buffer holding the word being stemmed. */
    private char[] b;
    /** i = length of the word in b; j, k, k0 = algorithm cursors into b. */
    private int i, j, k, k0;
    /** True once the buffered word differs from the word supplied. */
    private boolean dirty = false;

    public PStemmer() {
        b = new char[INITIAL_SIZE];
        i = 0;
    }

    /**
     * reset() resets the stemmer so it can stem another word. If you invoke
     * the stemmer by calling add(char) and then stem(), you must call reset()
     * before starting another word.
     */
    public void reset() {
        i = 0;
        dirty = false;
    }

    /**
     * Add a character to the word being stemmed. When you are finished
     * adding characters, you can call stem(void) to process the word.
     */
    public void add(char ch) {
        if (b.length <= i) {
            // Grow geometrically for amortized O(1) appends. This replaces
            // the former dependency on Lucene's ArrayUtil.grow (and the
            // RamUsageEstimator.NUM_BYTES_CHAR constant, which was removed
            // in later Lucene versions), making the class self-contained.
            char[] bigger = new char[Math.max(b.length * 2, i + 1)];
            System.arraycopy(b, 0, bigger, 0, i);
            b = bigger;
        }
        b[i++] = ch;
    }

    /**
     * After a word has been stemmed, it can be retrieved by toString(),
     * or a reference to the internal buffer can be retrieved by
     * getResultBuffer and getResultLength (which is generally more
     * efficient.)
     */
    @Override
    public String toString() {
        return new String(b, 0, i);
    }

    /**
     * Returns the length of the word resulting from the stemming process.
     */
    public int getResultLength() {
        return i;
    }

    /**
     * Returns a reference to a character buffer containing the results of
     * the stemming process. You also need to consult getResultLength()
     * to determine the length of the result.
     */
    public char[] getResultBuffer() {
        return b;
    }

    /** cons(i) is true <=> b[i] is a consonant. */
    private boolean cons(int i) {
        switch (b[i]) {
            case 'a': case 'e': case 'i': case 'o': case 'u':
                return false;
            case 'y':
                // 'y' at the start of the stem counts as a consonant;
                // elsewhere it is a consonant only when preceded by a vowel.
                return (i == k0) ? true : !cons(i - 1);
            default:
                return true;
        }
    }

    /* m() measures the number of consonant sequences between k0 and j. If c
       is a consonant sequence and v a vowel sequence, and <..> indicates
       arbitrary presence,

          <c><v>       gives 0
          <c>vc<v>     gives 1
          <c>vcvc<v>   gives 2
          <c>vcvcvc<v> gives 3
          ....
    */
    private int m() {
        int n = 0;
        int i = k0;
        // Skip the optional leading consonant sequence.
        while (true) {
            if (i > j)
                return n;
            if (!cons(i))
                break;
            i++;
        }
        i++;
        // Count each vc pair.
        while (true) {
            while (true) {
                if (i > j)
                    return n;
                if (cons(i))
                    break;
                i++;
            }
            i++;
            n++;
            while (true) {
                if (i > j)
                    return n;
                if (!cons(i))
                    break;
                i++;
            }
            i++;
        }
    }

    /** vowelinstem() is true <=> k0,...j contains a vowel. */
    private boolean vowelinstem() {
        for (int i = k0; i <= j; i++) {
            if (!cons(i))
                return true;
        }
        return false;
    }

    /** doublec(j) is true <=> j,(j-1) contain a double consonant. */
    private boolean doublec(int j) {
        if (j < k0 + 1)
            return false;
        if (b[j] != b[j - 1])
            return false;
        return cons(j);
    }

    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
       and also if the second c is not w, x or y. This is used when trying to
       restore an e at the end of a short word, e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.
    */
    private boolean cvc(int i) {
        if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
            return false;
        int ch = b[i];
        if (ch == 'w' || ch == 'x' || ch == 'y')
            return false;
        return true;
    }

    /** ends(s) is true <=> k0,...k ends with s; on success sets j = k - s.length(). */
    private boolean ends(String s) {
        int l = s.length();
        int o = k - l + 1;
        if (o < k0)
            return false;
        for (int i = 0; i < l; i++) {
            if (b[o + i] != s.charAt(i))
                return false;
        }
        j = k - l;
        return true;
    }

    /* setto(s) sets (j+1),...k to the characters in the string s,
       readjusting k. */
    void setto(String s) {
        int l = s.length();
        int o = j + 1;
        for (int i = 0; i < l; i++) {
            b[o + i] = s.charAt(i);
        }
        k = j + l;
        dirty = true;
    }

    /** r(s) replaces the matched suffix with s when the stem measure is positive. */
    void r(String s) {
        if (m() > 0)
            setto(s);
    }

    /* step1() gets rid of plurals and -ed or -ing. e.g.

          caresses -> caress    ponies   -> poni     ties    -> ti
          caress   -> caress    cats     -> cat
          feed     -> feed      agreed   -> agree    disabled -> disable
          matting  -> mat       mating   -> mate     meeting  -> meet
          milling  -> mill      messing  -> mess     meetings -> meet
    */
    private void step1() {
        if (b[k] == 's') {
            if (ends("sses"))
                k -= 2;
            else if (ends("ies"))
                setto("i");
            else if (b[k - 1] != 's')
                k--;
        }
        if (ends("eed")) {
            if (m() > 0)
                k--;
        } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
            k = j;
            if (ends("at"))
                setto("ate");
            else if (ends("bl"))
                setto("ble");
            else if (ends("iz"))
                setto("ize");
            else if (doublec(k)) {
                int ch = b[k--];
                // Keep the doubled letter for final l, s or z
                // (e.g. "milling" -> "mill", not "mil").
                if (ch == 'l' || ch == 's' || ch == 'z')
                    k++;
            } else if (m() == 1 && cvc(k)) {
                setto("e");
            }
        }
    }

    /** step2() turns terminal y to i when there is another vowel in the stem. */
    private void step2() {
        if (ends("y") && vowelinstem()) {
            b[k] = 'i';
            dirty = true;
        }
    }

    /* step3() maps double suffices to single ones. So -ization ( = -ize plus
       -ation) maps to -ize etc. Note that the string before the suffix must
       give m() > 0.
    */
    private void step3() {
        if (k == k0) return; /* For Bug 1 */
        switch (b[k - 1]) {
            case 'a':
                if (ends("ational")) { r("ate"); break; }
                if (ends("tional")) { r("tion"); break; }
                break;
            case 'c':
                if (ends("enci")) { r("ence"); break; }
                if (ends("anci")) { r("ance"); break; }
                break;
            case 'e':
                if (ends("izer")) { r("ize"); break; }
                break;
            case 'l':
                if (ends("bli")) { r("ble"); break; }
                if (ends("alli")) { r("al"); break; }
                if (ends("entli")) { r("ent"); break; }
                if (ends("eli")) { r("e"); break; }
                if (ends("ousli")) { r("ous"); break; }
                break;
            case 'o':
                if (ends("ization")) { r("ize"); break; }
                if (ends("ation")) { r("ate"); break; }
                if (ends("ator")) { r("ate"); break; }
                break;
            case 's':
                if (ends("alism")) { r("al"); break; }
                if (ends("iveness")) { r("ive"); break; }
                if (ends("fulness")) { r("ful"); break; }
                if (ends("ousness")) { r("ous"); break; }
                break;
            case 't':
                if (ends("aliti")) { r("al"); break; }
                if (ends("iviti")) { r("ive"); break; }
                if (ends("biliti")) { r("ble"); break; }
                break;
            case 'g':
                if (ends("logi")) { r("log"); break; }
        }
    }

    /** step4() deals with -ic-, -full, -ness etc. Similar strategy to step3. */
    private void step4() {
        switch (b[k]) {
            case 'e':
                if (ends("icate")) { r("ic"); break; }
                if (ends("ative")) { r(""); break; }
                if (ends("alize")) { r("al"); break; }
                break;
            case 'i':
                if (ends("iciti")) { r("ic"); break; }
                break;
            case 'l':
                if (ends("ical")) { r("ic"); break; }
                if (ends("ful")) { r(""); break; }
                break;
            case 's':
                if (ends("ness")) { r(""); break; }
                break;
        }
    }

    /** step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
    private void step5() {
        if (k == k0) return; /* for Bug 1 */
        switch (b[k - 1]) {
            case 'a':
                if (ends("al")) break;
                return;
            case 'c':
                if (ends("ance")) break;
                if (ends("ence")) break;
                return;
            case 'e':
                if (ends("er")) break;
                return;
            case 'i':
                if (ends("ic")) break;
                return;
            case 'l':
                if (ends("able")) break;
                if (ends("ible")) break;
                return;
            case 'n':
                if (ends("ant")) break;
                if (ends("ement")) break;
                if (ends("ment")) break;
                /* element etc. not stripped before the m */
                if (ends("ent")) break;
                return;
            case 'o':
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (ends("ou")) break;
                return;
                /* takes care of -ous */
            case 's':
                if (ends("ism")) break;
                return;
            case 't':
                if (ends("ate")) break;
                if (ends("iti")) break;
                return;
            case 'u':
                if (ends("ous")) break;
                return;
            case 'v':
                if (ends("ive")) break;
                return;
            case 'z':
                if (ends("ize")) break;
                return;
            default:
                return;
        }
        if (m() > 1)
            k = j;
    }

    /** step6() removes a final -e if m() > 1. */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1))
                k--;
        }
        if (b[k] == 'l' && doublec(k) && m() > 1)
            k--;
    }

    /**
     * Stem a word provided as a String. Returns the result as a String;
     * when stemming changed nothing, the original String is returned.
     */
    public String stem(String s) {
        if (stem(s.toCharArray(), s.length()))
            return toString();
        else
            return s;
    }

    /** Stem a word contained in a char[]. Returns true if the stemming process
     *  resulted in a word different from the input. You can retrieve the
     *  result with getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] word) {
        return stem(word, word.length);
    }

    /** Stem a word contained in a portion of a char[] array. Returns
     *  true if the stemming process resulted in a word different from
     *  the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] wordBuffer, int offset, int wordLen) {
        reset();
        if (b.length < wordLen) {
            b = new char[wordLen];
        }
        System.arraycopy(wordBuffer, offset, b, 0, wordLen);
        i = wordLen;
        return stem(0);
    }

    /** Stem a word contained in a leading portion of a char[] array.
     *  Returns true if the stemming process resulted in a word different
     *  from the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem(char[] word, int wordLen) {
        return stem(word, 0, wordLen);
    }

    /** Stem the word placed into the Stemmer buffer through calls to add().
     *  Returns true if the stemming process resulted in a word different
     *  from the input. You can retrieve the result with
     *  getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem() {
        return stem(0);
    }

    /**
     * Runs the stemming steps over the buffered word, with k0 = i0 as the
     * start of the stem. Returns true if the word was modified.
     */
    public boolean stem(int i0) {
        k = i - 1;
        k0 = i0;
        // Words of length <= 2 (relative to k0) are left untouched.
        if (k > k0 + 1) {
            step1(); step2(); step3(); step4(); step5(); step6();
        }
        // Also, a word is considered dirty if we lopped off letters.
        // Thanks to Ifigenia Vairelles for pointing this out.
        if (i != k + 1)
            dirty = true;
        i = k + 1;
        return dirty;
    }

    /** Test program for demonstrating the Stemmer. It reads a file and
     *  stems each word, writing the result to standard out.
     *  Usage: PStemmer file-name ...
     */
    public static void main(String[] args) {
        PStemmer s = new PStemmer();

        for (String fileName : args) {
            // try-with-resources guarantees the stream is closed even when
            // an IOException is thrown mid-read (the original leaked it).
            try (InputStream in = new FileInputStream(fileName)) {
                byte[] buffer = new byte[1024];
                int bufferLen, offset, ch;

                bufferLen = in.read(buffer);
                offset = 0;
                s.reset();

                while (true) {
                    if (offset < bufferLen)
                        ch = buffer[offset++];
                    else {
                        bufferLen = in.read(buffer);
                        offset = 0;
                        if (bufferLen < 0)
                            ch = -1;
                        else
                            ch = buffer[offset++];
                    }

                    // NOTE(review): the byte-to-char cast assumes single-byte
                    // text (ASCII/Latin-1); multi-byte UTF-8 letters are not
                    // decoded correctly here.
                    if (Character.isLetter((char) ch)) {
                        s.add(Character.toLowerCase((char) ch));
                    } else {
                        s.stem();
                        System.out.print(s.toString());
                        s.reset();
                        if (ch < 0)
                            break;
                        else
                            System.out.print((char) ch);
                    }
                }
            } catch (IOException e) {
                System.out.println("error reading " + fileName);
            }
        }
    }
}
package opennlp.tools.word2vec;

import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

import org.deeplearning4j.models.embeddings.WeightLookupTable;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.springframework.core.io.ClassPathResource;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Singleton wrapper around a pre-trained word2vec model, used to measure
 * semantic similarity between words.
 *
 * <p>The constructor attempts to load the GoogleNews binary model from
 * {@code src/test/resources/w2v/}; when loading fails, {@link #vec} is left
 * {@code null} and callers MUST check it before use.</p>
 */
public class W2VDistanceMeasurer {
	static W2VDistanceMeasurer instance;

	/** Loaded word2vec model, or null when the model could not be loaded. */
	public Word2Vec vec = null;
	private String resourceDir = null;

	/** Lazily creates the shared instance (model loading is expensive). */
	public synchronized static W2VDistanceMeasurer getInstance() {
		if (instance == null)
			instance = new W2VDistanceMeasurer();
		return instance;
	}

	/**
	 * Loads the GoogleNews word2vec model from the test-resources directory.
	 * On any failure, {@link #vec} stays null instead of throwing.
	 */
	public W2VDistanceMeasurer() {
		try {
			resourceDir = new File(".").getCanonicalPath() + "/src/test/resources";
		} catch (IOException e) {
			e.printStackTrace();
			vec = null;
			return;
		}

		String pathToW2V = resourceDir + "/w2v/GoogleNews-vectors-negative300.bin.gz";
		File gModel = new File(pathToW2V);
		try {
			vec = WordVectorSerializer.loadGoogleModel(gModel, true);
		} catch (IOException e) {
			System.out.println("Word2vec model is not loaded");
			vec = null;
		}
	}

	public static void main(String[] args) {
		W2VDistanceMeasurer vw2v = W2VDistanceMeasurer.getInstance();

		// FIX: the constructor deliberately leaves vec null when the model
		// file is absent; the original dereferenced it unconditionally and
		// crashed with a NullPointerException in that documented case.
		if (vw2v.vec == null) {
			System.out.println("Word2vec model unavailable; cannot compute similarity");
			return;
		}
		double value = vw2v.vec.similarity("product", "item");
		System.out.println(value);
	}

	/**
	 * Demo pipeline: trains a small word2vec model from raw_sentences.txt,
	 * writes the vectors to a text file and prints the nearest words to "day".
	 */
	public static void runCycle() {

		String filePath = null;
		try {
			filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
		} catch (IOException e1) {
			e1.printStackTrace();
		}

		System.out.println("Load & Vectorize Sentences....");
		// Strip white space before and after for each line
		SentenceIterator iter = null;
		try {
			iter = UimaSentenceIterator.createWithPath(filePath);
		} catch (Exception e1) {
			e1.printStackTrace();
		}
		// Split on white spaces in the line to get words
		TokenizerFactory t = new DefaultTokenizerFactory();
		t.setTokenPreProcessor(new CommonPreprocessor());

		InMemoryLookupCache cache = new InMemoryLookupCache();
		WeightLookupTable table = new InMemoryLookupTable.Builder()
				.vectorLength(100)
				.useAdaGrad(false)
				.cache(cache)
				.lr(0.025f).build();

		System.out.println("Building model....");
		Word2Vec vec = new Word2Vec.Builder()
				.minWordFrequency(5).iterations(1)
				.layerSize(100).lookupTable(table)
				.stopWords(new ArrayList<String>())
				.vocabCache(cache).seed(42)
				.windowSize(5).iterate(iter).tokenizerFactory(t).build();

		System.out.println("Fitting Word2Vec model....");
		try {
			vec.fit();
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println("Writing word vectors to text file....");
		// Write word
		try {
			WordVectorSerializer.writeWordVectors(vec, "pathToWriteto.txt");
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println("Closest Words:");
		Collection<String> lst = vec.wordsNearest("day", 10);
		System.out.println(lst);
	}
}
package opennlp.tools.fca;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedHashSet;

import junit.framework.TestCase;

/**
 * Test suite for the FCA (Formal Concept Analysis) lattice classes.
 *
 * <p>All test methods are currently disabled (commented out below); the
 * class only declares the shared {@code cl} fixture field. The disabled
 * tests exercise lattice construction from a .cxt context file, CSV/CXT
 * export, and random-noise generation over a binary context.</p>
 */
public class FCATest extends TestCase {

	// Shared lattice fixture used by the (disabled) tests; null until built.
	ConceptLattice cl = null;

	// NOTE(review): the original tests are kept here, disabled, exactly as
	// committed — re-enable once the fca resources/classes are stable.
/*
	public void testConceptLattice() {

		try {
			cl = new ConceptLattice("src/test/resources/fca/sports.cxt", true);
			cl.printLatticeStats();
			cl.printLatticeFull();
			cl.printBinContext();

			FcaWriter wt = new FcaWriter();
			wt.WriteStatsToCvs("stats.csv", cl, 0);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}

		FcaConverter converter = new FcaConverter();
		int[][] binCon = converter.latticeToContext(cl);

		if (binCon != null) {
			ConceptLattice new_cl = new ConceptLattice(binCon.length, binCon[0].length, binCon, false);
			new_cl.printLatticeStats();
			new_cl.printLatticeFull();
			new_cl.printBinContext();
			FcaWriter wt = new FcaWriter();
			wt.WriteStatsToCvs("stats.txt", cl, 0);
			//wt.WriteAsCxt("cl.cxt", cl);
			wt.WriteAsCxt("cl_new.cxt", new_cl);
		}
	}

	public void testRandom() {
		RandomNoiseGenerator rng = new RandomNoiseGenerator();
		try {
			cl = new ConceptLattice("src/test/resources/fca/sports.cxt", true);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		//int[][] bc = rng.AddObjectsAttributesWithProbability(10, 0.5, cl.binaryContext);
		int[][] bc = rng.AlterCellsWithProbability(0.2, cl.binaryContext);
		ConceptLattice new_cl = new ConceptLattice(bc.length, bc[0].length, bc, false);
		new_cl.printLatticeStats();
		new_cl.printLattice();
	}
*/
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.external_rst;

import java.util.List;

import junit.framework.TestCase;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.matching.Matcher;

/**
 * Verifies that importing an external RST (Rhetorical Structure Theory)
 * parse enriches a parse thicket with additional inter-sentence arcs.
 */
public class ExternalRSTImporterTest extends TestCase {

	public void testBuildParseThicketFromTextWithRSTtest() {
		Matcher matcher = new Matcher();
		// We combine our own RST rules with those of Joty 2014 to produce an
		// augmented parse thicket.
		String externalRSTresultFilename = "/external_rst/resInput.txt";

		// Multi-sentence complaint narrative used as the test document.
		ParseThicket thicket = matcher.buildParseThicketFromTextWithRST(
				"I explained that I made a deposit, and then wrote a check, which bounced due to a bank error. A customer service representative confirmed that it usually takes a day to process the deposit. "
						+ "I reminded that I was unfairly charged an overdraft fee amonth ago in a similar situation. "
						+ " They explained that the overdraft fee was due to insufficient funds as disclosed in my account information. I disagreed with their fee because I made a deposit well in "
						+ " advance and wanted this fee back. They denied responsibility saying that nothing an be done at this point. They also confirmed that I needed to look into the account rules closer.");

		ExternalRSTImporter importer = new ExternalRSTImporter();
		List<WordWordInterSentenceRelationArc> rstArcs =
				importer.buildPT2ptPhrases(thicket, externalRSTresultFilename);

		// A successful import yields a substantial number of RST arcs.
		assertTrue(rstArcs.size() > 10);
	}
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.collections.ListUtils; + +import junit.framework.TestCase; +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class JSMLearnerOnLatticeTest extends TestCase{ + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + LinguisticPatternStructure psPos = new LinguisticPatternStructure(0,0), psNeg = new LinguisticPatternStructure(0,0); + ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); + + public void testJSMLearner() { + + String text1p = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. "; + String text2p = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. 
"; + String text3p = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. "; + String text4p = "I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense."; + + + String text1n = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. "; + String text2n = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. "; + String text3n = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. "; + String text4n = "I showed my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income. 
"; + + List<List<ParseTreeChunk>> chunks1p = chunk_maker.formGroupedPhrasesFromChunksForPara(text1p); + List<List<ParseTreeChunk>> chunks2p = chunk_maker.formGroupedPhrasesFromChunksForPara(text2p); + List<List<ParseTreeChunk>> chunks3p = chunk_maker.formGroupedPhrasesFromChunksForPara(text3p); + List<List<ParseTreeChunk>> chunks4p = chunk_maker.formGroupedPhrasesFromChunksForPara(text4p); + List<List<ParseTreeChunk>> chunks1n = chunk_maker.formGroupedPhrasesFromChunksForPara(text1n); + List<List<ParseTreeChunk>> chunks2n = chunk_maker.formGroupedPhrasesFromChunksForPara(text2n); + List<List<ParseTreeChunk>> chunks3n = chunk_maker.formGroupedPhrasesFromChunksForPara(text3n); + List<List<ParseTreeChunk>> chunks4n = chunk_maker.formGroupedPhrasesFromChunksForPara(text4n); + + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + psPos.AddIntent(chunks1p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + psPos.AddIntent(chunks2p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + psPos.AddIntent(chunks3p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + psPos.AddIntent(chunks4p, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(0); + psNeg.AddIntent(chunks1n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + psNeg.AddIntent(chunks2n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + psNeg.AddIntent(chunks3n, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + psNeg.AddIntent(chunks4n, obj, 0); + + String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. 
I subtract my tax from my income"; + List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown); + List<List<List<ParseTreeChunk>>> posIntersections = new ArrayList<List<List<ParseTreeChunk>>>(), + negIntersections = new ArrayList<List<List<ParseTreeChunk>>>(); + List<List<ParseTreeChunk>> intersection = null; + for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ + if (psPos.conceptList.get(iConcept).intent!=null && psPos.conceptList.get(iConcept).intent.size()>0){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, chunksUnknown); + if (reduceList(intersection).size()>0) + posIntersections.add(reduceList(intersection)); + } + if (psNeg.conceptList.get(iConcept).intent!=null && psNeg.conceptList.get(iConcept).intent.size()>0){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, chunksUnknown); + if (reduceList(intersection).size()>0) + negIntersections.add(reduceList(intersection)); + } + } + + Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair = + removeInconsistenciesFromPosNegIntersections( posIntersections, + negIntersections); + + posIntersections = pair.getFirst(); + negIntersections = pair.getSecond(); + + List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = new ArrayList<List<List<ParseTreeChunk>>>(), + negIntersectionsUnderPos = new ArrayList<List<List<ParseTreeChunk>>>(); + + for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){ + for(int iConceptJ = 0; iConceptJ<negIntersections.size(); iConceptJ++){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, negIntersections.get(iConceptJ)); + if (reduceList(intersection).size()>0) + posIntersectionsUnderNeg.add(reduceList(intersection)); + } + } + + for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ + for(int iConceptJ = 
0; iConceptJ<posIntersections.size(); iConceptJ++){ + intersection = md + .matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, posIntersections.get(iConceptJ)); + if (reduceList(intersection).size()>0) + negIntersectionsUnderPos.add(reduceList(intersection)); + } + } + + List<ParseTreeChunk>posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg); + List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos); + + posIntersectionsUnderNegLst = subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst); + negIntersectionsUnderPosLst= subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst); + + System.out.println("Pos - neg inters = "+posIntersectionsUnderNegLst); + System.out.println("Neg - pos inters = "+negIntersectionsUnderPosLst); + + } + + public List<List<ParseTreeChunk>> reduceList(List<List<ParseTreeChunk>> list){ + float minScore = 1.3f; + List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); + + + ParseTreeChunkListScorer scorer = new ParseTreeChunkListScorer(); + for( List<ParseTreeChunk> group: list){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + if (scorer.getScore(ch) > minScore) + newGroup.add(ch); + } + if (newGroup.size()>0) + newList.add(newGroup); + } + + return newList; + + } + + public List<List<ParseTreeChunk>> flattenParseTreeChunkListList(List<List<List<ParseTreeChunk>>> listOfLists){ + List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); + + for( List<List<ParseTreeChunk>> member: listOfLists){ + Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); + for( List<ParseTreeChunk> group: member){ + if (group.size()>0) + newSet.addAll(group); + } + newList.add(new ArrayList<ParseTreeChunk>(newSet)); + } + + return newList; + } + + public List<ParseTreeChunk> flattenParseTreeChunkLst(List<List<List<ParseTreeChunk>>> listOfLists){ + 
List<ParseTreeChunk> newList = new ArrayList<ParseTreeChunk>(); + Set<ParseTreeChunk> newSetAll = new HashSet<ParseTreeChunk>(); + + + for( List<List<ParseTreeChunk>> member: listOfLists){ + Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); + for( List<ParseTreeChunk> group: member){ + if (group.size()>0) + newSet.addAll(group); + } + newSetAll.addAll(newSet); + } + + return removeDuplicates(new ArrayList<ParseTreeChunk>(newSetAll)); + } + + public List<ParseTreeChunk> removeDuplicates(List<ParseTreeChunk> dupes){ + List<Integer> toDelete = new ArrayList<Integer>(); + for(int i=0; i<dupes.size(); i++) + for(int j=i+1; j<dupes.size(); j++){ + if (dupes.get(i).equals(dupes.get(j))){ + toDelete.add(j); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<dupes.size(); i++){ + if (!toDelete.contains(i)) + cleaned.add(dupes.get(i)); + } + return cleaned; + } + + public List<ParseTreeChunk> subtract(List<ParseTreeChunk> main, List<ParseTreeChunk> toSubtract){ + List<Integer> toDelete = new ArrayList<Integer>(); + for(int i=0; i<main.size(); i++) + for(int j=0; j<toSubtract.size(); j++){ + if (main.get(i).equals(toSubtract.get(j))){ + toDelete.add(i); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<main.size(); i++){ + if (!toDelete.contains(i)) + cleaned.add(main.get(i)); + } + return cleaned; + } + public List<ParseTreeChunk> intesectParseTreeChunkLists(List<ParseTreeChunk> a, List<ParseTreeChunk> b){ + List<Integer> inters = new ArrayList<Integer>(); + for(int i=0; i<a.size(); i++) + for(int j=0; j<b.size(); j++){ + if (a.get(i).equals(b.get(j))){ + inters.add(i); + } + } + List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); + for(int i=0; i<a.size(); i++){ + if (inters.contains(i)) + cleaned.add(a.get(i)); + } + return cleaned; + } + + public Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> + 
removeInconsistenciesFromPosNegIntersections(List<List<List<ParseTreeChunk>>> pos, + List<List<List<ParseTreeChunk>>> neg ){ + + List<ParseTreeChunk> posIntersectionsFl = flattenParseTreeChunkLst(pos); + List<ParseTreeChunk> negIntersectionsFl = flattenParseTreeChunkLst(neg); + + List<ParseTreeChunk> intersParseTreeChunkLists = intesectParseTreeChunkLists(posIntersectionsFl, negIntersectionsFl); + + List<List<List<ParseTreeChunk>>> cleanedFromInconsPos = new ArrayList<List<List<ParseTreeChunk>>>(), + cleanedFromInconsNeg = new ArrayList<List<List<ParseTreeChunk>>>(); + /* + System.out.println("pos = "+ pos); + System.out.println("neg = "+ neg); + System.out.println("pos flat = "+ posIntersectionsFl); + System.out.println("neg flat = "+ negIntersectionsFl); + System.out.println("inters = "+ intersParseTreeChunkLists); + */ + + for( List<List<ParseTreeChunk>> member: pos){ + List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); + for( List<ParseTreeChunk> group: member){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + boolean bSkip = false; + for(ParseTreeChunk check: intersParseTreeChunkLists){ + if (check.equals(ch)) + bSkip=true; + } + if (!bSkip) + newGroup.add(ch); + } + if (newGroup.size()>0) + memberList.add(newGroup); + } + if (memberList.size()>0) + cleanedFromInconsPos.add(memberList); + } + + for( List<List<ParseTreeChunk>> member: neg){ + List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); + for( List<ParseTreeChunk> group: member){ + List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); + for(ParseTreeChunk ch: group){ + boolean bSkip = false; + for(ParseTreeChunk check: intersParseTreeChunkLists){ + if (check.equals(ch)) + bSkip=true; + } + if (!bSkip) + newGroup.add(ch); + } + if (newGroup.size()>0) + memberList.add(newGroup); + } + if (memberList.size()>0) + cleanedFromInconsNeg.add(memberList); + } + + return new 
Pair(cleanedFromInconsPos, cleanedFromInconsNeg); + + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java new file mode 100644 index 0000000..47324a2 --- /dev/null +++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PatternStructureTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; + +import junit.framework.TestCase; +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class PatternStructureTest extends TestCase{ + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + LinguisticPatternStructure ps = new LinguisticPatternStructure(0,0); + BingWebQueryRunner bqr = new BingWebQueryRunner(); + + public void test6texts() { + + String text1 = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income."; + String text2 = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue."; + String text3 = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent."; + String text4 = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income."; + String text5 = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it."; + String text6 = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. 
Remaining rental income needs to be added to my profit and be reported as taxable profit. "; + + List<List<ParseTreeChunk>> chunks1 = chunk_maker.formGroupedPhrasesFromChunksForPara(text1); + List<List<ParseTreeChunk>> chunks2 = chunk_maker.formGroupedPhrasesFromChunksForPara(text2); + List<List<ParseTreeChunk>> chunks3 = chunk_maker.formGroupedPhrasesFromChunksForPara(text3); + List<List<ParseTreeChunk>> chunks4 = chunk_maker.formGroupedPhrasesFromChunksForPara(text4); + List<List<ParseTreeChunk>> chunks5 = chunk_maker.formGroupedPhrasesFromChunksForPara(text5); + List<List<ParseTreeChunk>> chunks6 = chunk_maker.formGroupedPhrasesFromChunksForPara(text6); + //ArrayList<ParseTreeChunk> lst = new ArrayList<ParseTreeChunk>(); + + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + ps.AddIntent(chunks1, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + ps.AddIntent(chunks2, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + ps.AddIntent(chunks3, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(3); + ps.AddIntent(chunks4, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(4); + ps.AddIntent(chunks5, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(5); + ps.AddIntent(chunks6, obj, 0); + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(3); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + assertEquals(new_cl.getLattice().size(), 7); + assertEquals(new_cl.getAttributesCount(), 21); + assertEquals(new_cl.getObjectCount(), 3); + } + + // TEST 2 QUERY NEWS + public void testQueryNews(){ + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bq = new BingWebQueryRunner(); + + String q = ""; +// q = "barack 
obama"; +// q = "lady gaga"; + q = "angela merkel"; +// q = "putin"; + ArrayList <HitBase> hb = (ArrayList<HitBase>) bq.runSearch(q, 10); + int cnt = 0; + for (HitBase news: hb){ + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(cnt); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(news.getAbstractText()); + System.out.println(chunks); + ps.AddIntent(chunks,obj, 0); + cnt++; + } + + ps.logStability(); + System.out.println("LATTICE"); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(cnt); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + + FcaWriter wr = new FcaWriter(); + wr.WriteAsCxt("res.cxt", new_cl); + + System.out.println("Extent PS "+ps.conceptList.size()); + //for (int i = 0; i<ps.conceptList.size();i++){ + // System.out.println(ps.conceptList.get(i).extent); + //} + System.out.println("Extent CL "+new_cl.getLattice().size()); + //for (int i = 0; i<new_cl.getLattice().size();i++){ + // System.out.println(new_cl.getLattice().get(i).getExtent()); + //} + } + public void testNews(){ + List<List<ParseTreeChunk>> chunks = null; + + ArrayList <HitBase> result = (ArrayList<HitBase>) bqr.runSearch("site:http://news.yahoo.com " + "merkel", 10); + System.out.println(" ResultSize " + result.size()); + int ind = -1; + String text_result = ""; + for (int i = 0; i < result.size(); i++ ){ + System.out.println(result.get(i).getAbstractText()); + ind = result.get(i).getAbstractText().indexOf(") -"); + if (ind < 0) + ind = result.get(i)//.getDescription() + .getAbstractText().indexOf(") �"); + if (ind > 0) + text_result = result.get(i)//.getDescription() + .getAbstractText().substring(ind + 3); + else + text_result = result.get(i)//.getDescription() + .getAbstractText(); + + 
LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(i); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(text_result); + ps.AddIntent(chunks,obj, 0); + } + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(result.size()); + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + + FcaWriter wt = new FcaWriter(); + wt.WriteStatsToTxt("merkel_stats.txt", new_cl, 0); + wt.WriteStatsToCvs("merkel_stats.csv", new_cl, ps.conceptList.size()); + wt.WriteAsCxt("merkel_lattice.cxt", new_cl); + + PatternStructureWriter pswt = new PatternStructureWriter(); + pswt.WriteStatsToTxt("ps_res.txt", ps); + + System.out.println("Extent PS "+ps.conceptList.size()); + System.out.println("Extent CL "+new_cl.getLattice().size()); + } + + + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java new file mode 100755 index 0000000..58246e1 --- /dev/null +++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhraseTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.pattern_structure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; + +import opennlp.tools.fca.ConceptLattice; +import opennlp.tools.fca.FcaWriter; +import opennlp.tools.fca.FormalConcept; +import opennlp.tools.similarity.apps.BingWebQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +public class PhraseTest { + + + public static void main(String []args) { + +/*//TEST 1 + String text1 = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + + "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + + "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. 
"; + String text2 = "However, several experts familiar with the inspections believe that Iraq could also probably have produced a workable device in as little as 6 to 24 months, had they decided to seize foreign-supplied HEU from under safeguards and focus their efforts on a crash program to produce a device in the shortest possible amount of time."; + String text3 ="Iraq invested significant resources into uranium enrichment through laser isotope separation (LIS) involving both molecular (MLIS) and atomic vapor (AVLIS) technologies, including a number of activities with respect to laser component manufacture, particularly CO2 lasers and the manufacture of components for use in laser-related experimentation. The Laser Section within the Physics Department of the IAEC at Tuwaitha received an objective in 1981 from the IAEC to work in Laser Isotope Separation. It started in two lines; one which was looking after the molecular and the other the atomic vapor direction."; + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks1 = chunk_maker.formGroupedPhrasesFromChunksForPara(text1); + List<List<ParseTreeChunk>> chunks2 = chunk_maker.formGroupedPhrasesFromChunksForPara(text2); + List<List<ParseTreeChunk>> chunks3 = chunk_maker.formGroupedPhrasesFromChunksForPara(text3); + ArrayList<ParseTreeChunk> lst = new ArrayList<ParseTreeChunk>(); + PhrasePatternStructureExtended ps = new PhrasePatternStructureExtended(0,0); + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(0); + ps.AddIntent(chunks1, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(1); + ps.AddIntent(chunks2, obj, 0); + obj = new LinkedHashSet<Integer>(); + obj.add(2); + ps.AddIntent(chunks3, obj, 0); + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(3); + for (int i = 0; i < binaryContext.length; i++ ){ + 
System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); +*/ + +/* // TEST 2 QUERY NEWS + ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bq = new BingWebQueryRunner(); + + String q = ""; +// q = "barack obama"; +// q = "lady gaga"; + q = "angela merkel"; +// q = "putin"; + ArrayList <HitBase> hb = (ArrayList<HitBase>) bq.runSearch(q, 10); + PhrasePatternStructureExtended ps = new PhrasePatternStructureExtended(0,0); + int cnt = 0; + for (HitBase news: hb){ + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(cnt); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(news.getDescription()); + System.out.println(chunks); + ps.AddIntent(chunks,obj, 0); + cnt++; + } + + ps.logStability(); + System.out.println("LATTICE"); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(cnt); + for (int i = 0; i < binaryContext.length; i++ ){ + System.out.println(Arrays.toString(binaryContext[i])); + } + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + new_cl.printLatticeStats(); + new_cl.printLatticeFull(); + + FcaWriter wr = new FcaWriter(); + wr.WriteAsCxt("res.cxt", new_cl); + + System.out.println("Extent PS "+ps.conceptList.size()); + //for (int i = 0; i<ps.conceptList.size();i++){ + // System.out.println(ps.conceptList.get(i).extent); + //} + System.out.println("Extent CL "+new_cl.getLattice().size()); + //for (int i = 0; i<new_cl.getLattice().size();i++){ + // System.out.println(new_cl.getLattice().get(i).getExtent()); + //} +*/ + LinguisticPatternStructure ps = new LinguisticPatternStructure(0,0); + ParserChunker2MatcherProcessor chunk_maker = 
ParserChunker2MatcherProcessor.getInstance(); + List<List<ParseTreeChunk>> chunks = null; + BingWebQueryRunner bqr = new BingWebQueryRunner(); + ArrayList <HitBase> result = (ArrayList<HitBase>) bqr.runSearch("site:http://news.yahoo.com " + "merkel", 10); + System.out.println(" ResultSize " + result.size()); + int ind = -1; + String text_result = ""; + for (int i = 0; i < result.size(); i++ ){ + System.out.println(result.get(i).getAbstractText()); + ind = result.get(i).getAbstractText().indexOf(") -"); + if (ind < 0) + ind = result.get(i)//.getDescription() + .getAbstractText().indexOf(") �"); + if (ind > 0) + text_result = result.get(i)//.getDescription() + .getAbstractText().substring(ind + 3); + else + text_result = result.get(i)//.getDescription() + .getAbstractText(); + + LinkedHashSet<Integer> obj = null; + obj = new LinkedHashSet<Integer>(); + obj.add(i); + chunks = chunk_maker.formGroupedPhrasesFromChunksForPara(text_result); + ps.AddIntent(chunks,obj, 0); + } + + ps.logStability(); + ps.printLatticeExtended(); + + int [][] binaryContext = ps.toContext(result.size()); + + ConceptLattice new_cl = new ConceptLattice(binaryContext.length, binaryContext[0].length, binaryContext,true); + + FcaWriter wt = new FcaWriter(); + wt.WriteStatsToTxt("merkel_stats.txt", new_cl, 0); + wt.WriteStatsToCvs("merkel_stats.csv", new_cl, ps.conceptList.size()); + wt.WriteAsCxt("merkel_lattice.cxt", new_cl); + + PatternStructureWriter pswt = new PatternStructureWriter(); + pswt.WriteStatsToTxt("ps_res.txt", ps); + + System.out.println("Extent PS "+ps.conceptList.size()); + System.out.println("Extent CL "+new_cl.getLattice().size()); + + + } + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/external_rst/resInput.txt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/external_rst/resInput.txt 
b/opennlp-similarity/src/test/resources/external_rst/resInput.txt new file mode 100644 index 0000000..995b706 --- /dev/null +++ b/opennlp-similarity/src/test/resources/external_rst/resInput.txt @@ -0,0 +1,62 @@ +( Root (span 1 21) + ( Nucleus (span 1 5) (rel2par span) + ( Nucleus (span 1 3) (rel2par Joint) + ( Satellite (leaf 1) (rel2par Attribution) (text _!I explained_!) ) + ( Nucleus (span 2 3) (rel2par span) + ( Nucleus (leaf 2) (rel2par span) (text _!that_!) ) + ( Satellite (leaf 3) (rel2par Attribution) (text _!I made a deposit ,_!) ) + ) + ) + ( Nucleus (span 4 5) (rel2par Joint) + ( Nucleus (leaf 4) (rel2par span) (text _!and then wrote a check ,_!) ) + ( Satellite (leaf 5) (rel2par Elaboration) (text _!which bounced due to a bank error ._!) ) + ) + ) + ( Satellite (span 6 21) (rel2par Summary) + ( Nucleus (span 6 8) (rel2par span) + ( Satellite (leaf 6) (rel2par Attribution) (text _!A customer service representative confirmed_!) ) + ( Nucleus (span 7 8) (rel2par span) + ( Nucleus (leaf 7) (rel2par span) (text _!that it usually takes a day_!) ) + ( Satellite (leaf 8) (rel2par Enablement) (text _!to process the deposit ._!) ) + ) + ) + ( Satellite (span 9 21) (rel2par Elaboration) + ( Nucleus (span 9 16) (rel2par span) + ( Nucleus (span 9 13) (rel2par span) + ( Nucleus (span 9 10) (rel2par span) + ( Satellite (leaf 9) (rel2par Attribution) (text _!I reminded_!) ) + ( Nucleus (leaf 10) (rel2par span) (text _!that I was unfairly charged an overdraft fee a month ago in a similar situation ._!) ) + ) + ( Satellite (span 11 13) (rel2par Elaboration) + ( Satellite (leaf 11) (rel2par Attribution) (text _!They explained_!) ) + ( Nucleus (span 12 13) (rel2par span) + ( Nucleus (leaf 12) (rel2par span) (text _!that the overdraft fee was due to insufficient funds_!) ) + ( Satellite (leaf 13) (rel2par Comparison) (text _!as disclosed in my account information ._!) 
) + ) + ) + ) + ( Satellite (span 14 16) (rel2par Elaboration) + ( Nucleus (leaf 14) (rel2par span) (text _!I disagreed with their fee_!) ) + ( Satellite (span 15 16) (rel2par Explanation) + ( Nucleus (leaf 15) (rel2par Joint) (text _!because I made a deposit well in advance_!) ) + ( Nucleus (leaf 16) (rel2par Joint) (text _!and wanted this fee back ._!) ) + ) + ) + ) + ( Satellite (span 17 21) (rel2par Topic-Comment) + ( Nucleus (span 17 19) (rel2par Joint) + ( Nucleus (leaf 17) (rel2par span) (text _!They denied responsibility_!) ) + ( Satellite (span 18 19) (rel2par Elaboration) + ( Satellite (leaf 18) (rel2par Attribution) (text _!saying_!) ) + ( Nucleus (leaf 19) (rel2par span) (text _!that nothing can be done at this point ._!) ) + ) + ) + ( Nucleus (span 20 21) (rel2par Joint) + ( Satellite (leaf 20) (rel2par Attribution) (text _!They also confirmed_!) ) + ( Nucleus (leaf 21) (rel2par span) (text _!that I needed to look into the account rules closer ._!) ) + ) + ) + ) + ) +) + http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/fca/sports.cxt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/fca/sports.cxt b/opennlp-similarity/src/test/resources/fca/sports.cxt new file mode 100755 index 0000000..038dc49 --- /dev/null +++ b/opennlp-similarity/src/test/resources/fca/sports.cxt @@ -0,0 +1,55 @@ +B + +20 +10 + +Obj 1 +Obj 2 +Obj 3 +Obj 4 +Obj 5 +Obj 6 +Obj 7 +Obj 8 +Obj 9 +Obj 10 +Obj 11 +Obj 12 +Obj 13 +Obj 14 +Obj 15 +Obj 16 +Obj 17 +Obj 18 +Obj 19 +Obj 20 +Attr 1 +Attr 2 +Attr 3 +Attr 4 +Attr 5 +Attr 6 +Attr 7 +Attr 8 +Attr 9 +Attr 10 +X...X....X +X...X....X +X...X..XX. +X.X.X..X.X +X..X.XX.X. +X..X.XX.X. +X..X.XX.X. +X..X.XX.X. +.X.X..X.X. +.X.X....X. +.X..X....X +.X.X....X. +.X..X....X +.X..X..X.X +.X..X....X +..XX...XX. +..X.X...X. +..XX.XX.X. +..X.X...X. 
+..XX.....X http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/test/resources/new_vn.zip ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/test/resources/new_vn.zip b/opennlp-similarity/src/test/resources/new_vn.zip new file mode 100644 index 0000000..cf0b9bc Binary files /dev/null and b/opennlp-similarity/src/test/resources/new_vn.zip differ
