Author: jerome Date: Mon May 8 14:04:01 2006 New Revision: 405165 URL: http://svn.apache.org/viewcvs?rev=405165&view=rev Log: NUTCH-134 : Added a summarizer extension point and two extensions: * summary-basic is the current nutch implementation moved into a plugin * summary-lucene is a raw version of a summarizer plugin based on lucene highlighter
Added: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java (with props) lucene/nutch/trunk/src/plugin/summary-basic/ lucene/nutch/trunk/src/plugin/summary-basic/build.xml (with props) lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml (with props) lucene/nutch/trunk/src/plugin/summary-basic/src/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (with props) lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html (with props) lucene/nutch/trunk/src/plugin/summary-lucene/ lucene/nutch/trunk/src/plugin/summary-lucene/build.xml (with props) lucene/nutch/trunk/src/plugin/summary-lucene/lib/ lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar (with props) lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (with props) lucene/nutch/trunk/src/plugin/summary-lucene/src/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java (with props) lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html (with props) 
Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon May 8 14:04:01 2006 @@ -323,6 +323,8 @@ <packageset dir="${plugins.dir}/query-more/src/java"/> <packageset dir="${plugins.dir}/query-site/src/java"/> <packageset dir="${plugins.dir}/query-url/src/java"/> + <packageset dir="${plugins.dir}/summary-basic/src/java"/> + <packageset dir="${plugins.dir}/summary-lucene/src/java"/> <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/> <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/> <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/> @@ -350,6 +352,7 @@ <group title="Analysis Plugins" packages="${plugins.analysis}"/> <group title="Indexing Filter Plugins" packages="${plugins.index}"/> <group title="Query Filter Plugins" packages="${plugins.query}"/> + <group title="Summary Plugins" packages="${plugins.summary}"/> <group title="Clustering Plugins" packages="${plugins.clustering}"/> <group title="Ontology Plugins" packages="${plugins.ontology}"/> <group title="Misc. 
Plugins" packages="${plugins.misc}"/> Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon May 8 14:04:01 2006 @@ -564,7 +564,7 @@ <property> <name>plugin.includes</name> - <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value> + <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic</value> <description>Regular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By Modified: lucene/nutch/trunk/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Mon May 8 14:04:01 2006 @@ -126,6 +126,12 @@ plugins.clustering=\ org.apache.nutch.clustering.carrot2* +# +# Summary Plugins +# +plugins.summary=\ + org.apache.nutch.summary.basic*:\ + org.apache.nutch.summary.lucene* # # Misc. 
Plugins Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Mon May 8 14:04:01 2006 @@ -110,16 +110,12 @@ } private HashMap segments = new HashMap(); - private int sumContext = 5; - private int sumLength = 20; private Summarizer summarizer; /** Construct given a directory containing fetcher output. */ public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException { File[] segmentDirs = fs.listFiles(new File(segmentsDir)); - this.sumContext = conf.getInt("searcher.summary.context", 5); - this.sumLength = conf.getInt("searcher.summary.length", 20); - this.summarizer = new Summarizer(conf); + this.summarizer = new SummarizerFactory(conf).getSummarizer(); if (segmentDirs != null) { for (int i = 0; i < segmentDirs.length; i++) { @@ -158,9 +154,9 @@ public String getSummary(HitDetails details, Query query) throws IOException { + if (this.summarizer == null) { return ""; } String text = getSegment(details).getParseText(getUrl(details)).getText(); - - return this.summarizer.getSummary(text, query, this.sumContext, this.sumLength).toString(); + return this.summarizer.getSummary(text, query).toString(); } private class SummaryThread extends Thread { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java Mon May 8 14:04:01 2006 @@ -13,319 +13,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.searcher; -import java.io.*; -import java.util.*; - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.searcher.Summary.*; -import org.apache.nutch.analysis.NutchDocumentAnalyzer; -import org.apache.nutch.util.NutchConfiguration; +// Hadoop imports +import org.apache.hadoop.conf.Configurable; -/** Implements hit summarization. */ -public class Summarizer { - - /** Converts text to tokens. */ - private Analyzer ANALYZER; - private Configuration conf; +// Nutch imports +import org.apache.nutch.plugin.Pluggable; - /** - * The constructor. - * @param conf - */ - public Summarizer(Configuration conf) { - this.conf = conf; - this.ANALYZER = new NutchDocumentAnalyzer(conf); - } +/** + * Extension point for summarizer. + * + * @author Jérôme Charron + */ +public interface Summarizer extends Configurable, Pluggable { + + /** The name of the extension point. */ + public final static String X_POINT_ID = Summarizer.class.getName(); + /** - * Class Excerpt represents a single passage found in the document, with some - * appropriate regions highlit. + * Get a summary for a specified text. + * @param text is the text to summarize. + * @param query is the query for which the text is a hit. 
*/ - class Excerpt { - Vector passages = new Vector(); - SortedSet tokenSet = new TreeSet(); - int numTerms = 0; - - /** - */ - public Excerpt() { - } - - /** - */ - public void addToken(String token) { - tokenSet.add(token); - } - - /** - * Return how many unique toks we have - */ - public int numUniqueTokens() { - return tokenSet.size(); - } - - /** - * How many fragments we have. - */ - public int numFragments() { - return passages.size(); - } - - public void setNumTerms(int numTerms) { - this.numTerms = numTerms; - } - - public int getNumTerms() { - return numTerms; - } - - /** - * Add a frag to the list. - */ - public void add(Fragment fragment) { - passages.add(fragment); - } - - /** - * Return an Enum for all the fragments - */ - public Enumeration elements() { - return passages.elements(); - } - } - - /** Returns a summary for the given pre-tokenized text. */ - public Summary getSummary(String text, Query query, int sumContext, int sumLength) throws IOException { - - // Simplistic implementation. Finds the first fragments in the document - // containing any query terms. - // - // TODO: check that phrases in the query are matched in the fragment - - Token[] tokens = getTokens(text); // parse text to token array - - if (tokens.length == 0) - return new Summary(); - - String[] terms = query.getTerms(); - HashSet highlight = new HashSet(); // put query terms in table - for (int i = 0; i < terms.length; i++) - highlight.add(terms[i]); - - // - // Create a SortedSet that ranks excerpts according to - // how many query terms are present. 
An excerpt is - // a Vector full of Fragments and Highlights - // - SortedSet excerptSet = new TreeSet(new Comparator() { - public int compare(Object o1, Object o2) { - Excerpt excerpt1 = (Excerpt) o1; - Excerpt excerpt2 = (Excerpt) o2; - - if (excerpt1 == null && excerpt2 != null) { - return -1; - } else if (excerpt1 != null && excerpt2 == null) { - return 1; - } else if (excerpt1 == null && excerpt2 == null) { - return 0; - } - - int numToks1 = excerpt1.numUniqueTokens(); - int numToks2 = excerpt2.numUniqueTokens(); - - if (numToks1 < numToks2) { - return -1; - } else if (numToks1 == numToks2) { - return excerpt1.numFragments() - excerpt2.numFragments(); - } else { - return 1; - } - } - } - ); - - // - // Iterate through all terms in the document - // - int lastExcerptPos = 0; - for (int i = 0; i < tokens.length; i++) { - // - // If we find a term that's in the query... - // - if (highlight.contains(tokens[i].termText())) { - // - // Start searching at a point SUM_CONTEXT terms back, - // and move SUM_CONTEXT terms into the future. - // - int startToken = (i > sumContext) ? i - sumContext : 0; - int endToken = Math.min(i + sumContext, tokens.length); - int offset = tokens[startToken].startOffset(); - int j = startToken; - - // - // Iterate from the start point to the finish, adding - // terms all the way. The end of the passage is always - // SUM_CONTEXT beyond the last query-term. - // - Excerpt excerpt = new Excerpt(); - if (i != 0) { - excerpt.add(new Summary.Ellipsis()); - } - - // - // Iterate through as long as we're before the end of - // the document and we haven't hit the max-number-of-items - // -in-a-summary. 
- // - while ((j < endToken) && (j - startToken < sumLength)) { - // - // Now grab the hit-element, if present - // - Token t = tokens[j]; - if (highlight.contains(t.termText())) { - excerpt.addToken(t.termText()); - excerpt.add(new Fragment(text.substring(offset, t.startOffset()))); - excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset()))); - offset = t.endOffset(); - endToken = Math.min(j + sumContext, tokens.length); - } - - j++; - } - - lastExcerptPos = endToken; - - // - // We found the series of search-term hits and added - // them (with intervening text) to the excerpt. Now - // we need to add the trailing edge of text. - // - // So if (j < tokens.length) then there is still trailing - // text to add. (We haven't hit the end of the source doc.) - // Add the words since the last hit-term insert. - // - if (j < tokens.length) { - excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset()))); - } - - // - // Remember how many terms are in this excerpt - // - excerpt.setNumTerms(j - startToken); - - // - // Store the excerpt for later sorting - // - excerptSet.add(excerpt); - - // - // Start SUM_CONTEXT places away. The next - // search for relevant excerpts begins at i-SUM_CONTEXT - // - i = j + sumContext; - } - } - - // - // If the target text doesn't appear, then we just - // excerpt the first SUM_LENGTH words from the document. - // - if (excerptSet.size() == 0) { - Excerpt excerpt = new Excerpt(); - int excerptLen = Math.min(sumLength, tokens.length); - lastExcerptPos = excerptLen; - - excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset()))); - excerpt.setNumTerms(excerptLen); - excerptSet.add(excerpt); - } - - // - // Now choose the best items from the excerpt set. - // Stop when our Summary grows too large. 
- // - double tokenCount = 0; - Summary s = new Summary(); - while (tokenCount <= sumLength && excerptSet.size() > 0) { - Excerpt excerpt = (Excerpt) excerptSet.last(); - excerptSet.remove(excerpt); - - double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments(); - for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) { - Fragment f = (Fragment) e.nextElement(); - // Don't add fragments if it takes us over the max-limit - if (tokenCount + tokenFraction <= sumLength) { - s.add(f); - } - tokenCount += tokenFraction; - } - } - - if (tokenCount > 0 && lastExcerptPos < tokens.length) - s.add(new Ellipsis()); - return s; - } - - private Token[] getTokens(String text) throws IOException { - ArrayList result = new ArrayList(); - TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text)); - for (Token token = ts.next(); token != null; token = ts.next()) { - result.add(token); - } - return (Token[])result.toArray(new Token[result.size()]); - } - - /** - * Tests Summary-generation. User inputs the name of a - * text file and a query string - */ - public static void main(String argv[]) throws IOException { - // Test arglist - if (argv.length < 2) { - System.out.println("Usage: java org.apache.nutch.searcher.Summarizer <textfile> <queryStr>"); - return; - } - - Summarizer s = new Summarizer(NutchConfiguration.create()); - - // - // Parse the args - // - File textFile = new File(argv[0]); - StringBuffer queryBuf = new StringBuffer(); - for (int i = 1; i < argv.length; i++) { - queryBuf.append(argv[i]); - queryBuf.append(" "); - } - - // - // Load the text file into a single string. 
- // - StringBuffer body = new StringBuffer(); - BufferedReader in = new BufferedReader(new FileReader(textFile)); - try { - System.out.println("About to read " + textFile + " from " + in); - String str = in.readLine(); - while (str != null) { - body.append(str); - str = in.readLine(); - } - } finally { - in.close(); - } + public Summary getSummary(String text, Query query); - Configuration conf = NutchConfiguration.create(); - int sumContext = conf.getInt("searcher.summary.context", 5); - int sumLength = conf.getInt("searcher.summary.length", 20); - // Convert the query string into a proper Query - Query query = Query.parse(queryBuf.toString(), conf); - System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'"); - } } Added: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java Mon May 8 14:04:01 2006 @@ -0,0 +1,69 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.searcher; + +// JDK imports +import java.util.logging.Logger; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; + +// Nutch imports +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + + +/** + * A factory for retrieving {@link Summarizer} extensions. + * + * @author Jérôme Charron + */ +public class SummarizerFactory { + + /** My logger */ + public final static Logger LOG = + LogFormatter.getLogger(SummarizerFactory.class.getName()); + + /** The first available {@link Summarizer} */ + private Summarizer summarizer = null; + + + public SummarizerFactory(Configuration conf) { + try { + Extension[] extensions = PluginRepository + .get(conf) + .getExtensionPoint(Summarizer.X_POINT_ID) + .getExtensions(); + summarizer = (Summarizer) extensions[0].getExtensionInstance(); + summarizer.setConf(conf); + LOG.info("Using the first summarizer extension found: " + + extensions[0].getId()); + } catch (Exception e) { + LOG.warning(e.toString()); + } + } + + /** + * Get the first available {@link Summarizer} extension. + * @return the first available {@link Summarizer} extension, or + * <code>null</code> if none available. 
+ */ + public Summarizer getSummarizer() { + return summarizer; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon May 8 14:04:01 2006 @@ -50,6 +50,8 @@ <ant dir="query-more" target="deploy"/> <ant dir="query-site" target="deploy"/> <ant dir="query-url" target="deploy"/> + <ant dir="summary-basic" target="deploy"/> + <ant dir="summary-lucene" target="deploy"/> <ant dir="urlfilter-automaton" target="deploy"/> <ant dir="urlfilter-prefix" target="deploy"/> <ant dir="urlfilter-regex" target="deploy"/> @@ -126,6 +128,8 @@ <ant dir="query-more" target="clean"/> <ant dir="query-site" target="clean"/> <ant dir="query-url" target="clean"/> + <ant dir="summary-basic" target="clean"/> + <ant dir="summary-lucene" target="clean"/> <ant dir="urlfilter-automaton" target="clean"/> <ant dir="urlfilter-prefix" target="clean"/> <ant dir="urlfilter-regex" target="clean"/> Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=405165&r1=405164&r2=405165&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Mon May 8 14:04:01 2006 @@ -45,4 +45,8 @@ id="org.apache.nutch.analysis.NutchAnalyzer" name="Nutch Analysis"/> +<extension-point + id="org.apache.nutch.searcher.Summarizer" + name="Nutch Summarizer"/> 
+ </plugin> Added: lucene/nutch/trunk/src/plugin/summary-basic/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/build.xml?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-basic/build.xml (added) +++ lucene/nutch/trunk/src/plugin/summary-basic/build.xml Mon May 8 14:04:01 2006 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> + +<project name="summary-basic" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Propchange: lucene/nutch/trunk/src/plugin/summary-basic/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml Mon May 8 14:04:01 2006 @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<plugin + id="summary-basic" + name="Basic Summarizer Plug-in" + version="1.0.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="summary-basic.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.summary.basic" + name="Basic Summarizer" + point="org.apache.nutch.searcher.Summarizer"> + + <implementation id="Basic Summarizer" + class="org.apache.nutch.summary.basic.BasicSummarizer"/> + + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java (added) +++ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java Mon May 8 14:04:01 2006 @@ -0,0 +1,390 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.summary.basic; + +// JDK imports +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.Vector; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Lucene imports +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; + +// Nutch imports +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.Summarizer; +import org.apache.nutch.searcher.Summary; +import org.apache.nutch.searcher.Summary.Ellipsis; +import org.apache.nutch.searcher.Summary.Fragment; +import org.apache.nutch.searcher.Summary.Highlight; +import org.apache.nutch.util.NutchConfiguration; + + +/** Implements hit summarization. 
*/ +public class BasicSummarizer implements Summarizer { + + private int sumContext = 5; + private int sumLength = 20; + private Analyzer analyzer = null; + private Configuration conf = null; + + + public BasicSummarizer() { } + + private BasicSummarizer(Configuration conf) { + setConf(conf); + } + + + /* ----------------------------- * + * <implementation:Configurable> * + * ----------------------------- */ + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + this.analyzer = new NutchDocumentAnalyzer(conf); + this.sumContext = conf.getInt("searcher.summary.context", 5); + this.sumLength = conf.getInt("searcher.summary.length", 20); + } + + /* ------------------------------ * + * </implementation:Configurable> * + * ------------------------------ */ + + + /* --------------------------- * + * <implementation:Summarizer> * + * --------------------------- */ + + public Summary getSummary(String text, Query query) { + + // Simplistic implementation. Finds the first fragments in the document + // containing any query terms. + // + // TODO: check that phrases in the query are matched in the fragment + + Token[] tokens = getTokens(text); // parse text to token array + + if (tokens.length == 0) + return new Summary(); + + String[] terms = query.getTerms(); + HashSet highlight = new HashSet(); // put query terms in table + for (int i = 0; i < terms.length; i++) + highlight.add(terms[i]); + + // + // Create a SortedSet that ranks excerpts according to + // how many query terms are present. 
An excerpt is + // a Vector full of Fragments and Highlights + // + SortedSet excerptSet = new TreeSet(new Comparator() { + public int compare(Object o1, Object o2) { + Excerpt excerpt1 = (Excerpt) o1; + Excerpt excerpt2 = (Excerpt) o2; + + if (excerpt1 == null && excerpt2 != null) { + return -1; + } else if (excerpt1 != null && excerpt2 == null) { + return 1; + } else if (excerpt1 == null && excerpt2 == null) { + return 0; + } + + int numToks1 = excerpt1.numUniqueTokens(); + int numToks2 = excerpt2.numUniqueTokens(); + + if (numToks1 < numToks2) { + return -1; + } else if (numToks1 == numToks2) { + return excerpt1.numFragments() - excerpt2.numFragments(); + } else { + return 1; + } + } + } + ); + + // + // Iterate through all terms in the document + // + int lastExcerptPos = 0; + for (int i = 0; i < tokens.length; i++) { + // + // If we find a term that's in the query... + // + if (highlight.contains(tokens[i].termText())) { + // + // Start searching at a point SUM_CONTEXT terms back, + // and move SUM_CONTEXT terms into the future. + // + int startToken = (i > sumContext) ? i - sumContext : 0; + int endToken = Math.min(i + sumContext, tokens.length); + int offset = tokens[startToken].startOffset(); + int j = startToken; + + // + // Iterate from the start point to the finish, adding + // terms all the way. The end of the passage is always + // SUM_CONTEXT beyond the last query-term. + // + Excerpt excerpt = new Excerpt(); + if (i != 0) { + excerpt.add(new Summary.Ellipsis()); + } + + // + // Iterate through as long as we're before the end of + // the document and we haven't hit the max-number-of-items + // -in-a-summary. 
+ // + while ((j < endToken) && (j - startToken < sumLength)) { + // + // Now grab the hit-element, if present + // + Token t = tokens[j]; + if (highlight.contains(t.termText())) { + excerpt.addToken(t.termText()); + excerpt.add(new Fragment(text.substring(offset, t.startOffset()))); + excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset()))); + offset = t.endOffset(); + endToken = Math.min(j + sumContext, tokens.length); + } + + j++; + } + + lastExcerptPos = endToken; + + // + // We found the series of search-term hits and added + // them (with intervening text) to the excerpt. Now + // we need to add the trailing edge of text. + // + // So if (j < tokens.length) then there is still trailing + // text to add. (We haven't hit the end of the source doc.) + // Add the words since the last hit-term insert. + // + if (j < tokens.length) { + excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset()))); + } + + // + // Remember how many terms are in this excerpt + // + excerpt.setNumTerms(j - startToken); + + // + // Store the excerpt for later sorting + // + excerptSet.add(excerpt); + + // + // Start SUM_CONTEXT places away. The next + // search for relevant excerpts begins at i-SUM_CONTEXT + // + i = j + sumContext; + } + } + + // + // If the target text doesn't appear, then we just + // excerpt the first SUM_LENGTH words from the document. + // + if (excerptSet.size() == 0) { + Excerpt excerpt = new Excerpt(); + int excerptLen = Math.min(sumLength, tokens.length); + lastExcerptPos = excerptLen; + + excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset()))); + excerpt.setNumTerms(excerptLen); + excerptSet.add(excerpt); + } + + // + // Now choose the best items from the excerpt set. + // Stop when our Summary grows too large. 
+ // + double tokenCount = 0; + Summary s = new Summary(); + while (tokenCount <= sumLength && excerptSet.size() > 0) { + Excerpt excerpt = (Excerpt) excerptSet.last(); + excerptSet.remove(excerpt); + + double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments(); + for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) { + Fragment f = (Fragment) e.nextElement(); + // Don't add fragments if it takes us over the max-limit + if (tokenCount + tokenFraction <= sumLength) { + s.add(f); + } + tokenCount += tokenFraction; + } + } + + if (tokenCount > 0 && lastExcerptPos < tokens.length) + s.add(new Ellipsis()); + return s; + } + + /* ---------------------------- * + * </implementation:Summarizer> * + * ---------------------------- */ + + + /** + * Class Excerpt represents a single passage found in the document, with some + * appropriate regions highlit. + */ + class Excerpt { + Vector passages = new Vector(); + SortedSet tokenSet = new TreeSet(); + int numTerms = 0; + + /** + */ + public Excerpt() { + } + + /** + */ + public void addToken(String token) { + tokenSet.add(token); + } + + /** + * Return how many unique toks we have + */ + public int numUniqueTokens() { + return tokenSet.size(); + } + + /** + * How many fragments we have. + */ + public int numFragments() { + return passages.size(); + } + + public void setNumTerms(int numTerms) { + this.numTerms = numTerms; + } + + public int getNumTerms() { + return numTerms; + } + + /** + * Add a frag to the list. 
+ */ + public void add(Fragment fragment) { + passages.add(fragment); + } + + /** + * Return an Enum for all the fragments + */ + public Enumeration elements() { + return passages.elements(); + } + } + + + private Token[] getTokens(String text) { + ArrayList result = new ArrayList(); + TokenStream ts = analyzer.tokenStream("content", new StringReader(text)); + Token token = null; + while (true) { + try { + token = ts.next(); + } catch (IOException e) { + token = null; + } + if (token == null) { break; } + result.add(token); + } + try { + ts.close(); + } catch (IOException e) { + // ignore + } + return (Token[]) result.toArray(new Token[result.size()]); + } + + /** + * Tests Summary-generation. User inputs the name of a + * text file and a query string + */ + public static void main(String argv[]) throws IOException { + // Test arglist + if (argv.length < 2) { + System.out.println("Usage: java org.apache.nutch.searcher.Summarizer <textfile> <queryStr>"); + return; + } + + Configuration conf = NutchConfiguration.create(); + Summarizer s = new BasicSummarizer(conf); + + // + // Parse the args + // + File textFile = new File(argv[0]); + StringBuffer queryBuf = new StringBuffer(); + for (int i = 1; i < argv.length; i++) { + queryBuf.append(argv[i]); + queryBuf.append(" "); + } + + // + // Load the text file into a single string. 
+ // + StringBuffer body = new StringBuffer(); + BufferedReader in = new BufferedReader(new FileReader(textFile)); + try { + System.out.println("About to read " + textFile + " from " + in); + String str = in.readLine(); + while (str != null) { + body.append(str); + str = in.readLine(); + } + } finally { + in.close(); + } + + // Convert the query string into a proper Query + Query query = Query.parse(queryBuf.toString(), conf); + System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'"); + } +} Propchange: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html (added) +++ lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html Mon May 8 14:04:01 2006 @@ -0,0 +1,7 @@ +<html> +<body> +<p> +A basic summarizer implementation. 
+</p> +</body> +</html> Propchange: lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-lucene/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/build.xml?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-lucene/build.xml (added) +++ lucene/nutch/trunk/src/plugin/summary-lucene/build.xml Mon May 8 14:04:01 2006 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> + +<project name="summary-lucene" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar?rev=405165&view=auto ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Mon May 8 14:04:01 2006 @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<plugin + id="summary-lucene" + name="Lucene Highlighter Summary Plug-in" + version="1.0.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="summary-lucene.jar"> + <export name="*"/> + </library> + <library name="lucene-highlighter-2.0-rc1-dev.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <!-- NOTE(review): the extension id and the implementation id below say + "basic" although this descriptor belongs to the summary-lucene plugin; + confirm these ids are intended. --> + <extension id="org.apache.nutch.summary.basic" + name="Lucene Highlighter Summarizer" + point="org.apache.nutch.searcher.Summarizer"> + + <implementation id="Basic Summarizer" + class="org.apache.nutch.summary.lucene.LuceneSummarizer"/> + + </extension> + +</plugin> Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java (added) +++
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java Mon May 8 14:04:01 2006 @@ -0,0 +1,119 @@
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.summary.lucene;

// JDK imports
import java.io.StringReader;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.WeightedTerm;

// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;


/**
 * Summarizer that delegates passage selection to the Lucene highlighter.
 *
 * <p>The highlighter is asked for the best fragments of the text for the
 * query terms. Each fragment comes back as a string in which every matched
 * term is wrapped in the <code>SEPARATOR</code> marker; the string is split
 * on that marker and turned into alternating plain and highlighted pieces of
 * a {@link Summary}. An {@link Ellipsis} is appended after each fragment.</p>
 */
public class LuceneSummarizer implements Summarizer {

  /**
   * Marker emitted before and after every highlighted term.
   * NOTE(review): text that itself contains this marker is split at those
   * positions too, which flips the plain/highlight state for the rest of the
   * fragment. A custom Formatter would avoid the in-band marker.
   */
  private final static String SEPARATOR = "###";

  /** Wraps highlighted terms in the marker instead of HTML tags. */
  private final static Formatter FORMATTER =
    new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);

  /**
   * Maximum number of fragments requested from the highlighter.
   * TODO : should be configurable
   */
  private final static int MAX_FRAGMENTS = 3;

  /** Converts text to tokens. Created in {@link #setConf(Configuration)}. */
  private Analyzer analyzer = null;
  private Configuration conf = null;

  /**
   * Creates an unconfigured summarizer; {@link #setConf(Configuration)} must
   * be called before {@link #getSummary(String, Query)}.
   */
  public LuceneSummarizer() { }

  /** Creates a summarizer that is configured immediately. */
  private LuceneSummarizer(Configuration conf) {
    setConf(conf);
  }


  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */

  /** Returns the configuration passed to the last call of setConf. */
  public Configuration getConf() {
    return conf;
  }

  /** Stores the configuration and builds the analyzer from it. */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.analyzer = new NutchDocumentAnalyzer(conf);
  }

  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */


  /* --------------------------- *
   * <implementation:Summarizer> *
   * --------------------------- */

  /**
   * Builds a summary of the text with the query terms highlighted.
   *
   * @param text the text to summarize; <code>null</code> or empty text
   *             yields an empty summary
   * @param query the query whose terms are highlighted, each with weight 1
   * @return the summary, never <code>null</code>; it holds whatever was
   *         collected if the highlighter fails part way through
   */
  public Summary getSummary(String text, Query query) {

    Summary summary = new Summary();
    // Nothing to tokenize; also avoids the NullPointerException that
    // StringReader throws for a null string.
    if (text == null || text.length() == 0) {
      return summary;
    }

    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i = 0; i < terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter =
      new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens =
      analyzer.tokenStream("content", new StringReader(text));
    try {
      String[] result =
        highlighter.getBestFragments(tokens, text, MAX_FRAGMENTS);
      for (int i = 0; i < result.length; i++) {
        addPieces(summary, result[i]);
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // Deliberately best effort: a failing highlighter must not fail the
      // caller, so the pieces collected so far are returned.
      // NOTE(review): the failure is not reported anywhere; log it once a
      // logger is available to this plugin.
    }
    return summary;
  }

  /**
   * Splits one highlighter fragment on the marker and appends its pieces to
   * the summary. Pieces alternate between plain text and highlighted text,
   * starting with plain text.
   */
  private void addPieces(Summary summary, String fragment) {
    String[] parts = fragment.split(SEPARATOR);
    boolean highlight = false;
    for (int j = 0; j < parts.length; j++) {
      // A fragment that starts with a highlighted term, or two terms with
      // nothing between them, produces an empty piece. It still takes part
      // in the alternation but is not worth adding to the summary.
      if (parts[j].length() > 0) {
        if (highlight) {
          summary.add(new Highlight(parts[j]));
        } else {
          summary.add(new Fragment(parts[j]));
        }
      }
      highlight = !highlight;
    }
  }

  /* ---------------------------- *
   * </implementation:Summarizer> *
   * ---------------------------- */

}
Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html?rev=405165&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html (added) +++ lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html Mon May 8 14:04:01 2006 @@ -0,0 +1,7 @@ +<html> +<body> +<p> +A Lucene Highlighter based summarizer implementation. +</p> +</body> +</html> Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html ------------------------------------------------------------------------------ svn:eol-style = native