src...

joern Wed, 11 Feb 2015 00:53:32 -0800

Added: 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java?rev=1658901&view=auto
==============================================================================
--- 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
 (added)
+++ 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
 Wed Feb 11 08:53:14 2015
@@ -0,0 +1,298 @@
+/*
+       * Licensed to the Apache Software Foundation (ASF) under one or more
+       * contributor license agreements. See the NOTICE file distributed with
+       * this work for additional information regarding copyright ownership.
+       * The ASF licenses this file to You under the Apache License, Version 
2.0
+       * (the "License"); you may not use this file except in compliance with
+       * the License. You may obtain a copy of the License at
+       *
+       * http://www.apache.org/licenses/LICENSE-2.0
+       *
+       * Unless required by applicable law or agreed to in writing, software
+       * distributed under the License is distributed on an "AS IS" BASIS,
+       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+       * See the License for the specific language governing permissions and
+       * limitations under the License.
+*/
+
+package opennlp.summarization.textrank;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.List;
+
+import opennlp.summarization.*;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.preprocess.IDFWordWeight;
+import opennlp.summarization.preprocess.PorterStemmer;
+import opennlp.summarization.preprocess.StopWords;
+import opennlp.summarization.preprocess.WordWeight;
+
+/*
+ * Implements the TextRank algorithm by Mihalcea et al.
+ * This applies the PageRank algorithm to a graph in which each sentence is a node
+ * and a connection between two sentences indicates that they share a word.
+ * It returns a ranking of sentences in which the highest rank means most important.
+ * Currently only stemming is applied to the words; a more sophisticated approach
+ * might use a resource like WordNet to match synonyms.
+ */
+public class TextRank {
+       private StopWords sw;
+       private String article;
+       private Hashtable<Integer, List<Integer>> links;
+       private List<String> sentences = new ArrayList<String>();
+       private List<String> processedSent = new ArrayList<String>();
+       private WordWeight wordWt;
+       private int NO_OF_IT = 100;
+       private double maxErr = 0.1;
+       private DocProcessor docProc;
+
+       private double title_wt = 0;
+       private Hashtable<Integer, String[]> wordsInSent;
+
+       // DAMPING FACTOR..
+       private static double df = 0.15;
+       private boolean HIGHER_TITLE_WEIGHT = true;
+       private static double TITLE_WRD_WT = 2d;
+       private String resources = "./resources";
+
+       /**
+        * Creates a ranker that uses the given document processor.
+        * Sets up a fresh StopWords instance, an empty link table and an empty
+        * processed-sentence list, and obtains the word weights through
+        * IDFWordWeight.getInstance(resources + "/idf.csv").
+        *
+        * @param dp document processor; its stemmer is used by getNeighborsSigmaWtSim
+        */
+       public TextRank(DocProcessor dp) {
+               sw = new StopWords();
+               setLinks(new Hashtable<Integer, List<Integer>>());
+               processedSent = new ArrayList<String>();
+               docProc = dp;
+               wordWt = IDFWordWeight.getInstance(resources + "/idf.csv");
+       }
+
+       /**
+        * Creates a ranker with caller-supplied stop words and word weights.
+        *
+        * NOTE(review): this constructor does not set a DocProcessor, so
+        * getNeighborsSigmaWtSim (which calls docProc.getStemmer()) cannot be used
+        * on an instance built this way - confirm whether that is intended.
+        *
+        * @param sw stop word list consulted by getWeightedSimilarity
+        * @param wordWts word weights used to build the per-word weight table
+        */
+       public TextRank(StopWords sw, WordWeight wordWts) {
+               this.sw = sw;
+               this.wordWt = wordWts;
+               // The link table must exist before addLink/getTextRankScore run;
+               // leaving it null made getLinks().get(..) throw a NullPointerException.
+               setLinks(new Hashtable<Integer, List<Integer>>());
+       }
+
+       /**
+        * Returns the weighted word overlap of two sentences.
+        * Both sentences are split on single spaces. Each distinct (trimmed) word of
+        * the first sentence that is neither a stop word nor empty contributes its
+        * weight from wrdWts (or 1 when it has no entry) once for every equal token
+        * in the second sentence. The total is divided by the combined token count.
+        *
+        * @param sent1 first sentence
+        * @param sent2 second sentence
+        * @param wrdWts per-word weights (tf-idf according to the original comment)
+        * @return overlap weight normalised by the number of tokens in both sentences
+        */
+       public double getWeightedSimilarity(String sent1, String sent2,
+                       Hashtable<String, Double> wrdWts) {
+               String[] tokens1 = sent1.split(" ");
+               String[] tokens2 = sent2.split(" ");
+               Hashtable<String, Boolean> seen = new Hashtable<String, Boolean>();
+               double overlap = 0;
+
+               for (String raw : tokens1) {
+                       String word = raw.trim();
+                       // each distinct word of the first sentence is handled once
+                       if (seen.containsKey(word)) {
+                               continue;
+                       }
+                       seen.put(word, Boolean.TRUE);
+
+                       for (String other : tokens2) {
+                               // the match test uses the untrimmed token, as before
+                               if (sw.isStopWord(word) || word.isEmpty() || !raw.equals(other)) {
+                                       continue;
+                               }
+                               Double wt = wrdWts.get(word);
+                               overlap += (wt != null) ? wt.doubleValue() : 1d;
+                       }
+               }
+               return overlap / (tokens1.length + tokens2.length);
+       }
+
+       // Gets the current score from the list of scores passed ...
+       public double getScoreFrom(List<Score> scores, int id) {
+               for (Score s : scores) {
+                       if (s.getSentId() == id)
+                               return s.getScore();
+               }
+               return 1;
+       }
+
+       /**
+        * Runs the PageRank-style iteration over the sentence graph:
+        *
+        *   TR(Vi) = (1 - d) + d * sum over neighbors Vj of ( wij / sum_k(wjk) * TR(Vj) )
+        *
+        * where wij is the weighted similarity of sentences i and j and sum_k(wjk)
+        * is the raw score of j. After the iteration every rank is multiplied by
+        * the raw score of its sentence.
+        *
+        * NOTE(review): the field df (0.15) is used as d above, whereas the TextRank
+        * paper uses d = 0.85 - confirm which is intended.
+        *
+        * @param rawScores per-sentence sum of neighbor similarities
+        * @param sentences sentence texts, indexed by sentence id
+        * @param wrdWts per-word weights handed to getWeightedSimilarity
+        * @return one Score per entry of rawScores, in the same order (unsorted)
+        */
+       public List<Score> getTextRankScore(List<Score> rawScores,
+                       List<String> sentences, Hashtable<String, Double> wrdWts) {
+               List<Score> currWtScores = new ArrayList<Score>();
+               if (rawScores.isEmpty()) {
+                       // nothing to rank; also avoids a 0/0 convergence test below
+                       return currWtScores;
+               }
+
+               // Start with equal weights for all sentences
+               for (Score raw : rawScores) {
+                       Score ns = new Score();
+                       ns.setSentId(raw.getSentId());
+                       ns.setScore((1 - title_wt) / rawScores.size());
+                       currWtScores.add(ns);
+               }
+
+               // Page rank..
+               for (int i = 0; i < NO_OF_IT; i++) {
+                       double totErr = 0;
+                       List<Score> newWtScores = new ArrayList<Score>();
+
+                       // Update the scores for the current iteration..
+                       for (Score rs : rawScores) {
+                               int sentId = rs.getSentId();
+                               Score ns = new Score();
+                               ns.setSentId(sentId);
+
+                               List<Integer> neighbors = getLinks().get(sentId);
+                               double sum = 0;
+                               if (neighbors != null) {
+                                       for (Integer j : neighbors) {
+                                               double sigmawjk = getScoreFrom(rawScores, j);
+                                               if (sigmawjk == 0) {
+                                                       // a zero denominator would turn the sum into NaN/Infinity
+                                                       continue;
+                                               }
+                                               double wij = this.getWeightedSimilarity(
+                                                               sentences.get(sentId), sentences.get(j), wrdWts);
+                                               double txtRnkj = getScoreFrom(currWtScores, j);
+                                               sum += wij / sigmawjk * txtRnkj;
+                                       }
+                               }
+                               ns.setScore((1d - df) + sum * df);
+                               // Convergence is the absolute change against the previous
+                               // iteration; a signed difference against the raw score lets
+                               // negative terms cancel and stops the loop for the wrong reason.
+                               totErr += Math.abs(ns.getScore() - getScoreFrom(currWtScores, sentId));
+                               newWtScores.add(ns);
+                       }
+                       currWtScores = newWtScores;
+                       if (i > 2 && totErr / rawScores.size() < maxErr)
+                               break;
+               }
+
+               // Scale every rank by the raw score of its sentence.
+               for (Score s : currWtScores) {
+                       s.setScore(s.getScore() * getScoreFrom(rawScores, s.getSentId()));
+               }
+               return currWtScores;
+       }
+
+       /**
+        * Computes the raw score of every sentence: the sum of its weighted
+        * similarities to the other sentences found through the inverted index.
+        * A link is recorded for every pair whose similarity is positive. The raw
+        * score is used in the denominator of the TextRank formula.
+        *
+        * @param sentences sentence texts; the list index is used as the sentence id
+        * @param iidx inverted index, looked up with the stemmed form of each word
+        * @param wts per-word weights handed to getWeightedSimilarity
+        * @return one Score per sentence, in list order
+        */
+       public List<Score> getNeighborsSigmaWtSim(List<String> sentences,
+                       Hashtable<String, List<Integer>> iidx, Hashtable<String, Double> wts) {
+               List<Score> result = new ArrayList<Score>();
+
+               for (int i = 0; i < sentences.size(); i++) {
+                       Score sigma = new Score();
+                       sigma.setSentId(i);
+                       // sentences already compared with sentence i
+                       List<Integer> visited = new ArrayList<Integer>();
+
+                       for (String token : sentences.get(i).split(" ")) {
+                               String stem = docProc.getStemmer().stem(token).toString();
+
+                               List<Integer> candidates = iidx.get(stem);
+                               if (candidates == null) {
+                                       continue;
+                               }
+
+                               for (int other : candidates) {
+                                       if (other == i || visited.contains(other)) {
+                                               continue;
+                                       }
+                                       double similarity = getWeightedSimilarity(
+                                                       sentences.get(i), sentences.get(other), wts);
+                                       sigma.setScore(sigma.getScore() + similarity);
+                                       if (similarity > 0) {
+                                               addLink(i, other);
+                                       }
+                                       visited.add(other);
+                               }
+                       }
+                       result.add(sigma);
+               }
+               return result;
+       }
+
+       /**
+        * Computes the TextRank scores and returns them sorted with
+        * Collections.sort, i.e. in the natural order defined by Score.
+        *
+        * @param rawScores per-sentence sum of neighbor similarities
+        * @param sentences sentence texts, indexed by sentence id
+        * @param wordWts per-word weights
+        * @return the sorted TextRank scores
+        */
+       public List<Score> getWeightedScores(List<Score> rawScores,
+                       List<String> sentences, Hashtable<String, Double> wordWts) {
+               List<Score> ranked = getTextRankScore(rawScores, sentences, wordWts);
+               Collections.sort(ranked);
+               return ranked;
+       }
+
+       /**
+        * Builds a word-to-weight table holding one entry for every key of the
+        * inverted index, with the weight reported by the given WordWeight.
+        *
+        * @param wwt source of the word weights
+        * @param iidx inverted index whose keys are the words to look up
+        * @return a new table from word to weight
+        */
+       private Hashtable<String, Double> toWordWtHashtable(WordWeight wwt,
+                       Hashtable<String, List<Integer>> iidx) {
+               Hashtable<String, Double> weights = new Hashtable<String, Double>();
+               for (String term : iidx.keySet()) {
+                       weights.put(term, wwt.getWordWeight(term));
+               }
+               return weights;
+       }
+
+       /**
+        * Ranks the given sentences with TextRank.
+        * Stores the sentence lists on this instance, builds the word weight table
+        * from the inverted index, optionally gives every word of the first
+        * sentence the title weight, and then computes raw and final scores.
+        *
+        * @param doc the document text (not used by this method)
+        * @param sentences sentence texts; the list index is used as the sentence id
+        * @param iidx inverted index from word to sentence ids
+        * @param processedSent processed form of the sentences; only stored here
+        * @return the sorted TextRank scores
+        */
+       public List<Score> getRankedSentences(String doc, List<String> sentences,
+                       Hashtable<String, List<Integer>> iidx, List<String> processedSent) {
+               this.sentences = sentences;
+               this.processedSent = processedSent;
+
+               Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx);
+
+               // The first sentence is treated as the title: its words get a fixed weight.
+               if (HIGHER_TITLE_WEIGHT && !getSentences().isEmpty()) {
+                       for (String wrd : getSentences().get(0).split(" ")) {
+                               wrdWts.put(wrd, Double.valueOf(TITLE_WRD_WT));
+                       }
+               }
+
+               List<Score> rawScores = getNeighborsSigmaWtSim(getSentences(), iidx, wrdWts);
+               return getWeightedScores(rawScores, getSentences(), wrdWts);
+       }
+
+       /**
+        * Records a directed link from sentence i to sentence idx, creating the
+        * target list for i on first use.
+        */
+       private void addLink(int i, int idx) {
+               Hashtable<Integer, List<Integer>> graph = getLinks();
+               List<Integer> targets = graph.get(i);
+               if (targets == null) {
+                       targets = new ArrayList<Integer>();
+               }
+               targets.add(idx);
+               graph.put(i, targets);
+       }
+
+       /** Stores the given sentence list on this ranker. */
+       public void setSentences(List<String> sentences) {
+               this.sentences = sentences;
+       }
+
+       /** Returns the stored sentence list itself (not a copy). */
+       public List<String> getSentences() {
+               return sentences;
+       }
+
+       /** Stores the article text; within this class it is only read back by getArticle(). */
+       public void setArticle(String article) {
+               this.article = article;
+       }
+
+       /** Returns the article text last passed to setArticle, or null if none was set. */
+       public String getArticle() {
+               return article;
+       }
+
+       /** Replaces the table that maps a sentence id to the ids it links to. */
+       private void setLinks(Hashtable<Integer, List<Integer>> links) {
+               this.links = links;
+       }
+
+       /**
+        * Returns the link table itself (not a copy): sentence id mapped to the
+        * ids recorded for it by addLink.
+        */
+       public Hashtable<Integer, List<Integer>> getLinks() {
+               return links;
+       }
+}
+
+/*
+ * public double getScore(String sent1, String sent2, boolean toPrint) {
+ * String[] words1 = sent1.split(" "); String[] words2 = sent2.split(" ");
+ * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int
+ * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) &&
+ * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { 
wordsInCommon+=
+ * wordWt.getWordWeight(words1[i]); } } } return ((double)wordsInCommon) /
+ * (Math.log(1+words1.length) + Math.log(1+words2.length)); }
+ */
\ No newline at end of file


Propchange: 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java
------------------------------------------------------------------------------
    svn:executable = *

Added: 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java?rev=1658901&view=auto
==============================================================================
--- 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
 (added)
+++ 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
 Wed Feb 11 08:53:14 2015
@@ -0,0 +1,142 @@
+/*
+       * Licensed to the Apache Software Foundation (ASF) under one or more
+       * contributor license agreements. See the NOTICE file distributed with
+       * this work for additional information regarding copyright ownership.
+       * The ASF licenses this file to You under the Apache License, Version 
2.0
+       * (the "License"); you may not use this file except in compliance with
+       * the License. You may obtain a copy of the License at
+       *
+       * http://www.apache.org/licenses/LICENSE-2.0
+       *
+       * Unless required by applicable law or agreed to in writing, software
+       * distributed under the License is distributed on an "AS IS" BASIS,
+       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+       * See the License for the specific language governing permissions and
+       * limitations under the License.
+*/
+
+package opennlp.summarization.textrank;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
+import java.util.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import opennlp.summarization.*;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+import opennlp.summarization.preprocess.IDFWordWeight;
+import opennlp.summarization.preprocess.WordWeight;
+
+/*
+ * A wrapper around the TextRank algorithm. This class
+ * a) sets up the data for the TextRank class, and
+ * b) takes the ranked sentences and does some basic rearranging (e.g. ordering)
+ *    to provide a more reasonable summary.
+ */
+public class TextRankSummarizer implements Summarizer
+{
+       // An optional file that stores the IDF of words. If IDF is not available, a default equal weight is used for all words.
+    private String idfFile = "resources/idf.csv";
+    /**
+     * Creates a summarizer. The body is empty, so nothing in this constructor
+     * itself throws the declared exception.
+     */
+    public TextRankSummarizer() throws Exception
+    {
+    }
+ 
+    /*Sets up data and calls the TextRank algorithm..*/
+    public List<Score> rankSentences(String doc, List<Sentence> sentences, 
+                                                            DocProcessor dp, 
int maxWords )
+    { 
+        try {            
+           //Rank sentences    
+            TextRank summ = new TextRank(dp);
+            List<String> sentenceStrL = new ArrayList<String>();
+            List<String> processedSent = new ArrayList<String>();
+            Hashtable<String, List<Integer>> iidx = new Hashtable<String, 
List<Integer>>();
+       //     dp.getSentences(sentences, sentenceStrL, iidx, processedSent);
+            
+            for(Sentence s : sentences){               
+               sentenceStrL.add(s.getStringVal());
+               String stemmedSent = s.stem();
+               processedSent.add(stemmedSent);
+               
+               String[] wrds = stemmedSent.split(" ");
+               for(String w: wrds)
+               {
+                       if(iidx.get(w)!=null) 
+                               iidx.get(w).add(s.getSentId());
+                       else{
+                               List<Integer> l = new ArrayList<Integer>();
+                               l.add(s.getSentId());
+                               iidx.put(w, l);
+                       }
+               }
+            }        
+           
+            WordWeight wordWt = new IDFWordWeight(idfFile);////new 
+            
+           List<Score> finalScores = summ.getRankedSentences(doc, 
sentenceStrL, iidx, processedSent);
+           List<String> sentenceStrList = summ.getSentences();
+           
+          // SentenceClusterer clust = new SentenceClusterer();
+          //  clust.runClusterer(doc, summ.processedSent);
+                
+               Hashtable<Integer,List<Integer>> links= summ.getLinks();
+
+                       for(int i=0;i<sentences.size();i++)
+                       {
+                               Sentence st = sentences.get(i);
+                               
+                               //Add links..
+                               List<Integer> currLnks = links.get(i);
+                               if(currLnks==null) continue;
+                               for(int j=0;j<currLnks.size();j++)
+                               {
+                                       if(j<i) st.addLink(sentences.get(j));   
+                               }
+                       }
+                       
+                       for(int i=0;i<finalScores.size();i++)
+                       {
+                               Score s = finalScores.get(i);
+                               Sentence st = sentences.get(s.getSentId());
+                               st.setPageRankScore(s);
+                       }
+
+                       List<Score> reRank = finalScores;//reRank(sentences, 
finalScores, iidx, wordWt, maxWords);
+                       
+                       return reRank;
+               } catch (Exception e) {
+                       // TODO Auto-generated catch block
+                       e.printStackTrace();
+               }
+               return null;
+    }
+
+    //Returns the summary as a string. 
+       @Override
+       public String summarize(String article, DocProcessor dp, int maxWords) {
+        List<Sentence> sentences = dp.getSentencesFromStr(article);        
+        List<Score> scores = this.rankSentences(article, sentences, dp, 
maxWords);
+        return scores2String(sentences, scores, maxWords);
+       }
+    
+       /**
+        * Uses the page rank scores to build the summary text. Walks the scores in
+        * list order and appends the text of each scored sentence followed by the
+        * string form of its Score, until the buffer length reaches maxWords or
+        * the scores run out.
+        *
+        * NOTE(review): the budget compares a character count with a parameter
+        * named maxWords, and the Score itself is appended to the text - confirm
+        * that both are intended.
+        */
+       public String scores2String(List<Sentence> sentences, List<Score> scores, int maxWords)
+       {
+               StringBuilder summary = new StringBuilder();
+               for (int pos = 0; summary.length() < maxWords && pos < scores.size(); pos++)
+               {
+                       Score ranked = scores.get(pos);
+                       String text = sentences.get(ranked.getSentId()).getStringVal();
+                       summary.append(text + ranked);
+               }
+               return summary.toString();
+       }
+    
+}

Propchange: 
opennlp/sandbox/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java
------------------------------------------------------------------------------
    svn:executable = *

Added: opennlp/sandbox/summarizer/src/test/java/unittests/DocProcessorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/test/java/unittests/DocProcessorTest.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/test/java/unittests/DocProcessorTest.java 
(added)
+++ opennlp/sandbox/summarizer/src/test/java/unittests/DocProcessorTest.java 
Wed Feb 11 08:53:14 2015
@@ -0,0 +1,49 @@
+/* 
+    * Licensed to the Apache Software Foundation (ASF) under one or more
+       * contributor license agreements. See the NOTICE file distributed with
+       * this work for additional information regarding copyright ownership.
+       * The ASF licenses this file to You under the Apache License, Version 
2.0
+       * (the "License"); you may not use this file except in compliance with
+       * the License. You may obtain a copy of the License at
+       *
+       * http://www.apache.org/licenses/LICENSE-2.0
+       *
+       * Unless required by applicable law or agreed to in writing, software
+       * distributed under the License is distributed on an "AS IS" BASIS,
+       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+       * See the License for the specific language governing permissions and
+       * limitations under the License.
+*/
+
+
+package unittests;
+
+import static org.junit.Assert.*;
+
+import org.junit.Assert.*;
+
+import java.io.UnsupportedEncodingException;
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/** Tests sentence splitting in DefaultDocProcessor. */
+public class DocProcessorTest {
+
+       @BeforeClass
+       public static void setUpBeforeClass() throws Exception {
+       }
+
+       /** The sample text is expected to split into three sentences. */
+       @Test
+       public void testGetSentencesFromStr() {
+               String sentFragModel = "resources/en-sent.bin";
+               DefaultDocProcessor dp = new DefaultDocProcessor(sentFragModel);
+               String sent = "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes.";
+               List<Sentence> doc = dp.getSentencesFromStr(sent);
+               // assertEquals takes (expected, actual); the swapped order produced a
+               // misleading failure message.
+               assertEquals(3, doc.size());
+       }
+
+}

Added: opennlp/sandbox/summarizer/src/test/java/unittests/LexChainTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/test/java/unittests/LexChainTest.java?rev=1658901&view=auto
==============================================================================
--- opennlp/sandbox/summarizer/src/test/java/unittests/LexChainTest.java (added)
+++ opennlp/sandbox/summarizer/src/test/java/unittests/LexChainTest.java Wed 
Feb 11 08:53:14 2015
@@ -0,0 +1,125 @@
+/*
+       * Licensed to the Apache Software Foundation (ASF) under one or more
+       * contributor license agreements. See the NOTICE file distributed with
+       * this work for additional information regarding copyright ownership.
+       * The ASF licenses this file to You under the Apache License, Version 
2.0
+       * (the "License"); you may not use this file except in compliance with
+       * the License. You may obtain a copy of the License at
+       *
+       * http://www.apache.org/licenses/LICENSE-2.0
+       *
+       * Unless required by applicable law or agreed to in writing, software
+       * distributed under the License is distributed on an "AS IS" BASIS,
+       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+       * See the License for the specific language governing permissions and
+       * limitations under the License.
+*/
+
+package unittests;
+
+import static org.junit.Assert.*;
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.*;
+import opennlp.summarization.lexicalchaining.Word;
+import opennlp.summarization.lexicalchaining.WordRelation;
+import opennlp.summarization.lexicalchaining.WordRelationshipDetermination;
+import opennlp.summarization.lexicalchaining.WordnetWord;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import edu.mit.jwi.item.IIndexWord;
+import edu.mit.jwi.item.POS;
+
+import java.util.Collections;
+import java.util.Hashtable;
+import java.util.List;
+
+public class LexChainTest {
+
+       @BeforeClass
+       public static void setUpBeforeClass() throws Exception {
+       }
+
+
+       @Test
+       public void testBuildLexicalChains() {
+               try {
+                       /*
+                       String article = "US President Barack Obama has 
welcomed an agreement between the US and Russia under which Syria's chemical 
weapons must be destroyed or removed by mid-2014 as an \"important step\"."
+                                       + "But a White House statement 
cautioned that the US expected Syria to live up to its public commitments. "
+                                       + "The US-Russian framework document 
stipulates that Syria must provide details of its stockpile within a week. "
+                                       + "If Syria fails to comply, the deal 
could be enforced by a UN resolution. "
+                                       + " China, France, the UK, the UN and 
Nato have all expressed satisfaction at the agreement. "
+                                       + " In Beijing, Foreign Minister Wang 
Yi said on Sunday that China welcomes the general agreement between the US and 
Russia.";
+*/
+                       String sentFragModel = "resources/en-sent.bin";
+                       DefaultDocProcessor dp =new 
DefaultDocProcessor(sentFragModel);
+                       String article = 
dp.docToString("/Users/ram/dev/summarizer/test/forram/technology/output/summary/9.txt");
+                       LexicalChainingSummarizer lcs;
+                       lcs = new 
LexicalChainingSummarizer(dp,"resources/en-pos-maxent.bin");
+
+                       long strt = System.currentTimeMillis();
+
+                       List<Sentence> sent = dp.getSentencesFromStr(article);
+                       List<LexicalChain> vh = lcs.buildLexicalChains(article, 
sent);
+                       Collections.sort(vh);
+                       
+                       List<Sentence> s = dp.getSentencesFromStr(article);
+                       Hashtable<String, Boolean> comp = new Hashtable<String, 
Boolean>(); 
+                       System.out.println(vh.size());
+                       POSTagger t = new 
OpenNLPPOSTagger(dp,"resources/en-pos-maxent.bin");
+                       System.out.println(t.getTaggedString(article));
+                       for(int i=vh.size()-1;i>=Math.max(vh.size()-50, 0);i--)
+                       {
+                               LexicalChain lc = vh.get(i);
+                               
+                               if(! 
(comp.containsKey(lc.getWord().get(0).getLexicon())))
+                               {
+                                       
comp.put(lc.getWord().get(0).getLexicon(), new Boolean(true));
+                                       for(int j=0;j<lc.getWord().size();j++)
+                                               
System.out.print(lc.getWord().get(j) + "-- ");
+                                       System.out.println(lc.score());
+                                       for(Sentence sid : lc.getSentences())
+                                       {
+                                               //if(sid>=0 && sid<s.size())
+                                               System.out.println(sid);
+                                       }
+                               }
+                               System.out.println("--------");
+                       }
+                       System.out.println((System.currentTimeMillis() - 
strt)/1000);
+               } catch (Exception e) {
+                       // TODO Auto-generated catch block
+                       e.printStackTrace();
+               }
+               
+       }
+
+       @Test
+       public void testGetRelation() {
+               try {
+                       
+                       WordRelationshipDetermination lcs = new 
WordRelationshipDetermination();
+                       LexicalChain l = new LexicalChain();
+                       List<Word> words = lcs.getWordSenses("music");
+                       
+                       l.addWord(words.get(0));
+//                     int rel = lcs.getRelation(l, "nation");
+                       WordRelation rel2 = lcs.getRelation(l, "tune", true);
+                       WordRelation rel3 = lcs.getRelation(l, "vocal", true);
+                       System.out.println(rel2.relation);
+                       System.out.println(rel3.relation);
+       //              assertEquals(rel, 
LexicalChainingSummarizer.STRONG_RELATION);
+                       assertEquals( WordRelation.MED_RELATION, rel2.relation);
+                       assertEquals( WordRelation.MED_RELATION, rel3.relation);
+                       
+               } catch (Exception e) {
+                       e.printStackTrace();
+               }               
+       }
+
+}

Added: 
opennlp/sandbox/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java?rev=1658901&view=auto
==============================================================================
--- 
opennlp/sandbox/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
 (added)
+++ 
opennlp/sandbox/summarizer/src/test/java/unittests/LexChainingKeywordExtractorTest.java
 Wed Feb 11 08:53:14 2015
@@ -0,0 +1,63 @@
+/*
+       * Licensed to the Apache Software Foundation (ASF) under one or more
+       * contributor license agreements. See the NOTICE file distributed with
+       * this work for additional information regarding copyright ownership.
+       * The ASF licenses this file to You under the Apache License, Version 
2.0
+       * (the "License"); you may not use this file except in compliance with
+       * the License. You may obtain a copy of the License at
+       *
+       * http://www.apache.org/licenses/LICENSE-2.0
+       *
+       * Unless required by applicable law or agreed to in writing, software
+       * distributed under the License is distributed on an "AS IS" BASIS,
+       * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.
+       * See the License for the specific language governing permissions and
+       * limitations under the License.
+*/
+
+package unittests;
+
+import static org.junit.Assert.*;
+
+import java.util.List;
+
+import opennlp.summarization.Sentence;
+import opennlp.summarization.lexicalchaining.LexChainingKeywordExtractor;
+import opennlp.summarization.lexicalchaining.LexicalChain;
+import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer;
+import opennlp.summarization.preprocess.DefaultDocProcessor;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/** Tests keyword extraction from lexical chains. */
+public class LexChainingKeywordExtractorTest {
+
+       @BeforeClass
+       public static void setUpBeforeClass() throws Exception {
+       }
+
+       /**
+        * Builds lexical chains for a short inline article and extracts keywords
+        * from them. Exceptions propagate so that a failure is reported instead
+        * of being printed and ignored.
+        */
+       @Test
+       public void testGetKeywords() throws Exception {
+               // Inline text keeps the test independent of files on a developer machine.
+               String article = "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014. "
+                               + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. "
+                               + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. "
+                               + "If Syria fails to comply, the deal could be enforced by a UN resolution.";
+
+               String sentFragModel = "resources/en-sent.bin";
+               DefaultDocProcessor dp = new DefaultDocProcessor(sentFragModel);
+               LexicalChainingSummarizer lcs = new LexicalChainingSummarizer(dp, "resources/en-pos-maxent.bin");
+
+               List<Sentence> sent = dp.getSentencesFromStr(article);
+               List<LexicalChain> vh = lcs.buildLexicalChains(article, sent);
+               LexChainingKeywordExtractor ke = new LexChainingKeywordExtractor();
+               List<String> keywords = ke.getKeywords(vh, 5);
+               System.out.println(keywords);
+               // NOTE(review): minimal check only; tighten once the expected
+               // keywords for this text are known.
+               assertNotNull(keywords);
+       }
+
+}

svn commit: r1658901 [2/2] - in /opennlp/sandbox/summarizer: ./ src/ src/main/ src/main/java/ src/main/java/opennlp/ src/main/java/opennlp/summarization/ src/main/java/opennlp/summarization/lexicalchaining/ src/main/java/opennlp/summarization/meta/ src...

Reply via email to