BasicSummarizer.java

jerome Sat, 13 May 2006 01:52:42 -0700

Author: jerome
Date: Sat May 13 01:52:20 2006
New Revision: 406048

URL: http://svn.apache.org/viewcvs?rev=406048&view=rev
Log:
NUTCH-134 : The Basic Summarizer now select the best snippets


Modified:
    
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java

Modified: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=406048&r1=406047&r2=406048&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
 Sat May 13 01:52:20 2006
@@ -22,9 +22,11 @@
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.Vector;
@@ -56,6 +58,38 @@
   private Analyzer analyzer = null;
   private Configuration conf = null;
   
+  private final static Comparator ORDER_COMPARATOR = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      return ((Excerpt) o1).getOrder() - ((Excerpt) o2).getOrder();
+    }
+  };
+  
+  private final static Comparator SCORE_COMPARATOR = new Comparator() {
+    public int compare(Object o1, Object o2) {
+      Excerpt excerpt1 = (Excerpt) o1;
+      Excerpt excerpt2 = (Excerpt) o2;
+
+      if (excerpt1 == null && excerpt2 != null) {
+        return -1;
+      } else if (excerpt1 != null && excerpt2 == null) {
+        return 1;
+      } else if (excerpt1 == null && excerpt2 == null) {
+        return 0;
+      }
+
+      int numToks1 = excerpt1.numUniqueTokens();
+      int numToks2 = excerpt2.numUniqueTokens();
+
+      if (numToks1 < numToks2) {
+        return -1;
+      } else if (numToks1 == numToks2) {
+        return excerpt1.numFragments() - excerpt2.numFragments();
+      } else {
+        return 1;
+      }
+    }
+  };
+
   
   public BasicSummarizer() { }
   
@@ -105,37 +139,9 @@
     for (int i = 0; i < terms.length; i++)
       highlight.add(terms[i]);
     
-    //
-    // Create a SortedSet that ranks excerpts according to
-    // how many query terms are present.  An excerpt is
-    // a Vector full of Fragments and Highlights
-    //
-    SortedSet excerptSet = new TreeSet(new Comparator() {
-      public int compare(Object o1, Object o2) {
-        Excerpt excerpt1 = (Excerpt) o1;
-        Excerpt excerpt2 = (Excerpt) o2;
-        
-        if (excerpt1 == null && excerpt2 != null) {
-          return -1;
-        } else if (excerpt1 != null && excerpt2 == null) {
-          return 1;
-        } else if (excerpt1 == null && excerpt2 == null) {
-          return 0;
-        }
-        
-        int numToks1 = excerpt1.numUniqueTokens();
-        int numToks2 = excerpt2.numUniqueTokens();
-        
-        if (numToks1 < numToks2) {
-          return -1;
-        } else if (numToks1 == numToks2) {
-          return excerpt1.numFragments() - excerpt2.numFragments();
-        } else {
-          return 1;
-        }
-      }
-    }
-    );
+    // A list to store document's excerpts.
+    // (An excerpt is a Vector full of Fragments and Highlights)
+    List excerpts = new ArrayList();
     
     //
     // Iterate through all terms in the document
@@ -160,7 +166,7 @@
         // terms all the way.  The end of the passage is always
         // SUM_CONTEXT beyond the last query-term.
         //
-        Excerpt excerpt = new Excerpt();
+        Excerpt excerpt = new Excerpt(i);
         if (i != 0) {
           excerpt.add(new Summary.Ellipsis());
         }
@@ -209,7 +215,7 @@
         //
         // Store the excerpt for later sorting
         //
-        excerptSet.add(excerpt);
+        excerpts.add(excerpt);
         
         //
         // Start SUM_CONTEXT places away.  The next
@@ -219,30 +225,46 @@
       }
     }
     
+    // Sort the excerpts based on their score
+    Collections.sort(excerpts, SCORE_COMPARATOR);
+    
     //
     // If the target text doesn't appear, then we just
     // excerpt the first SUM_LENGTH words from the document.
     //
-    if (excerptSet.size() == 0) {
-      Excerpt excerpt = new Excerpt();
+    if (excerpts.size() == 0) {
+      Excerpt excerpt = new Excerpt(0);
       int excerptLen = Math.min(sumLength, tokens.length);
       lastExcerptPos = excerptLen;
       
       excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), 
tokens[excerptLen-1].startOffset())));
       excerpt.setNumTerms(excerptLen);
-      excerptSet.add(excerpt);
+      excerpts.add(excerpt);
     }
     
     //
     // Now choose the best items from the excerpt set.
-    // Stop when our Summary grows too large.
+    // Stop when we have enought excerpts to build our Summary.
     //
     double tokenCount = 0;
+    int numExcerpt = excerpts.size()-1;
+    List bestExcerpts = new ArrayList();
+    while (tokenCount <= sumLength && numExcerpt >= 0) {
+      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
+      bestExcerpts.add(excerpt);
+      tokenCount += excerpt.getNumTerms();
+    }    
+    // Sort the best excerpts based on their natural order
+    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
+    
+    //
+    // Now build our Summary from the best the excerpts.
+    //
+    tokenCount = 0;
+    numExcerpt = 0;
     Summary s = new Summary();
-    while (tokenCount <= sumLength && excerptSet.size() > 0) {
-      Excerpt excerpt = (Excerpt) excerptSet.last();
-      excerptSet.remove(excerpt);
-      
+    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
+      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
       double tokenFraction = (1.0 * excerpt.getNumTerms()) / 
excerpt.numFragments();
       for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
         Fragment f = (Fragment) e.nextElement();
@@ -272,10 +294,12 @@
     Vector passages = new Vector();
     SortedSet tokenSet = new TreeSet();
     int numTerms = 0;
+    int order = 0;
     
     /**
      */
-    public Excerpt() {
+    public Excerpt(int order) {
+      this.order = order;
     }
     
     /**
@@ -300,6 +324,10 @@
     
     public void setNumTerms(int numTerms) {
       this.numTerms = numTerms;
+    }
+    
+    public int getOrder() {
+      return order;
     }
     
     public int getNumTerms() {

svn commit: r406048 - /lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java

Reply via email to