Author: jerome Date: Sat May 13 01:52:20 2006 New Revision: 406048 URL: http://svn.apache.org/viewcvs?rev=406048&view=rev Log: NUTCH-134 : The Basic Summarizer now selects the best snippets
Modified: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java Modified: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=406048&r1=406047&r2=406048&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (original) +++ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java Sat May 13 01:52:20 2006 @@ -22,9 +22,11 @@ import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; import java.util.HashSet; +import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import java.util.Vector; @@ -56,6 +58,38 @@ private Analyzer analyzer = null; private Configuration conf = null; + private final static Comparator ORDER_COMPARATOR = new Comparator() { + public int compare(Object o1, Object o2) { + return ((Excerpt) o1).getOrder() - ((Excerpt) o2).getOrder(); + } + }; + + private final static Comparator SCORE_COMPARATOR = new Comparator() { + public int compare(Object o1, Object o2) { + Excerpt excerpt1 = (Excerpt) o1; + Excerpt excerpt2 = (Excerpt) o2; + + if (excerpt1 == null && excerpt2 != null) { + return -1; + } else if (excerpt1 != null && excerpt2 == null) { + return 1; + } else if (excerpt1 == null && excerpt2 == null) { + return 0; + } + + int numToks1 = excerpt1.numUniqueTokens(); + int numToks2 = excerpt2.numUniqueTokens(); + + if (numToks1 < numToks2) { + return -1; + } else if (numToks1 == numToks2) { + return excerpt1.numFragments() - excerpt2.numFragments(); + } else { + return 1; + } + } + }; + 
public BasicSummarizer() { } @@ -105,37 +139,9 @@ for (int i = 0; i < terms.length; i++) highlight.add(terms[i]); - // - // Create a SortedSet that ranks excerpts according to - // how many query terms are present. An excerpt is - // a Vector full of Fragments and Highlights - // - SortedSet excerptSet = new TreeSet(new Comparator() { - public int compare(Object o1, Object o2) { - Excerpt excerpt1 = (Excerpt) o1; - Excerpt excerpt2 = (Excerpt) o2; - - if (excerpt1 == null && excerpt2 != null) { - return -1; - } else if (excerpt1 != null && excerpt2 == null) { - return 1; - } else if (excerpt1 == null && excerpt2 == null) { - return 0; - } - - int numToks1 = excerpt1.numUniqueTokens(); - int numToks2 = excerpt2.numUniqueTokens(); - - if (numToks1 < numToks2) { - return -1; - } else if (numToks1 == numToks2) { - return excerpt1.numFragments() - excerpt2.numFragments(); - } else { - return 1; - } - } - } - ); + // A list to store document's excerpts. + // (An excerpt is a Vector full of Fragments and Highlights) + List excerpts = new ArrayList(); // // Iterate through all terms in the document @@ -160,7 +166,7 @@ // terms all the way. The end of the passage is always // SUM_CONTEXT beyond the last query-term. // - Excerpt excerpt = new Excerpt(); + Excerpt excerpt = new Excerpt(i); if (i != 0) { excerpt.add(new Summary.Ellipsis()); } @@ -209,7 +215,7 @@ // // Store the excerpt for later sorting // - excerptSet.add(excerpt); + excerpts.add(excerpt); // // Start SUM_CONTEXT places away. The next @@ -219,30 +225,46 @@ } } + // Sort the excerpts based on their score + Collections.sort(excerpts, SCORE_COMPARATOR); + // // If the target text doesn't appear, then we just // excerpt the first SUM_LENGTH words from the document. 
// - if (excerptSet.size() == 0) { - Excerpt excerpt = new Excerpt(); + if (excerpts.size() == 0) { + Excerpt excerpt = new Excerpt(0); int excerptLen = Math.min(sumLength, tokens.length); lastExcerptPos = excerptLen; excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset()))); excerpt.setNumTerms(excerptLen); - excerptSet.add(excerpt); + excerpts.add(excerpt); } // // Now choose the best items from the excerpt set. - // Stop when our Summary grows too large. + // Stop when we have enought excerpts to build our Summary. // double tokenCount = 0; + int numExcerpt = excerpts.size()-1; + List bestExcerpts = new ArrayList(); + while (tokenCount <= sumLength && numExcerpt >= 0) { + Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--); + bestExcerpts.add(excerpt); + tokenCount += excerpt.getNumTerms(); + } + // Sort the best excerpts based on their natural order + Collections.sort(bestExcerpts, ORDER_COMPARATOR); + + // + // Now build our Summary from the best the excerpts. 
+ // + tokenCount = 0; + numExcerpt = 0; Summary s = new Summary(); - while (tokenCount <= sumLength && excerptSet.size() > 0) { - Excerpt excerpt = (Excerpt) excerptSet.last(); - excerptSet.remove(excerpt); - + while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) { + Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++); double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments(); for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) { Fragment f = (Fragment) e.nextElement(); @@ -272,10 +294,12 @@ Vector passages = new Vector(); SortedSet tokenSet = new TreeSet(); int numTerms = 0; + int order = 0; /** */ - public Excerpt() { + public Excerpt(int order) { + this.order = order; } /** @@ -300,6 +324,10 @@ public void setNumTerms(int numTerms) { this.numTerms = numTerms; + } + + public int getOrder() { + return order; } public int getNumTerms() { ------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs