summary-basic...

jerome Mon, 08 May 2006 14:04:30 -0700

Author: jerome
Date: Mon May  8 14:04:01 2006
New Revision: 405165

URL: http://svn.apache.org/viewcvs?rev=405165&view=rev
Log:
NUTCH-134 : Added a summarizer extension point and two extensions:
* summary-basic is the current nutch implementation moved into a plugin
* summary-lucene is a raw version of a summarizer plugin based on the lucene 
highlighter


Added:
    
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java   
(with props)
    lucene/nutch/trunk/src/plugin/summary-basic/
    lucene/nutch/trunk/src/plugin/summary-basic/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/summary-basic/src/
    lucene/nutch/trunk/src/plugin/summary-basic/src/java/
    lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/
    lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/
    
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/
    
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/
    
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
   (with props)
    
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
   (with props)
    lucene/nutch/trunk/src/plugin/summary-lucene/
    lucene/nutch/trunk/src/plugin/summary-lucene/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/summary-lucene/lib/
    
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar
   (with props)
    lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/summary-lucene/src/
    lucene/nutch/trunk/src/plugin/summary-lucene/src/java/
    lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/
    lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/
    
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/
    
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/
    
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
   (with props)
    
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
   (with props)
Modified:
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/default.properties
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Mon May  8 14:04:01 2006
@@ -323,6 +323,8 @@
       <packageset dir="${plugins.dir}/query-more/src/java"/>
       <packageset dir="${plugins.dir}/query-site/src/java"/>
       <packageset dir="${plugins.dir}/query-url/src/java"/>
+      <packageset dir="${plugins.dir}/summary-basic/src/java"/>
+      <packageset dir="${plugins.dir}/summary-lucene/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
@@ -350,6 +352,7 @@
       <group title="Analysis Plugins" packages="${plugins.analysis}"/>
       <group title="Indexing Filter Plugins" packages="${plugins.index}"/>
       <group title="Query Filter Plugins" packages="${plugins.query}"/>
+      <group title="Summary Plugins" packages="${plugins.summary}"/>
       <group title="Clustering Plugins" packages="${plugins.clustering}"/>
       <group title="Ontology Plugins" packages="${plugins.ontology}"/>
       <group title="Misc. Plugins" packages="${plugins.misc}"/>

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon May  8 14:04:01 2006
@@ -564,7 +564,7 @@
 
 <property>
   <name>plugin.includes</name>
-  
<value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
+  
<value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By

Modified: lucene/nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Mon May  8 14:04:01 2006
@@ -126,6 +126,12 @@
 plugins.clustering=\
    org.apache.nutch.clustering.carrot2*
 
+#
+# Summary Plugins
+#
+plugins.summary=\
+   org.apache.nutch.summary.basic*:\
+   org.apache.nutch.summary.lucene*
 
 #
 # Misc. Plugins

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
Mon May  8 14:04:01 2006
@@ -110,16 +110,12 @@
   }
 
   private HashMap segments = new HashMap();
-  private int sumContext = 5;
-  private int sumLength = 20;
   private Summarizer summarizer;
 
   /** Construct given a directory containing fetcher output. */
   public FetchedSegments(FileSystem fs, String segmentsDir, Configuration 
conf) throws IOException {
     File[] segmentDirs = fs.listFiles(new File(segmentsDir));
-    this.sumContext = conf.getInt("searcher.summary.context", 5);
-    this.sumLength = conf.getInt("searcher.summary.length", 20);
-    this.summarizer = new Summarizer(conf);
+    this.summarizer = new SummarizerFactory(conf).getSummarizer();
 
     if (segmentDirs != null) {
         for (int i = 0; i < segmentDirs.length; i++) {
@@ -158,9 +154,9 @@
   public String getSummary(HitDetails details, Query query)
     throws IOException {
 
+    if (this.summarizer == null) { return ""; }
     String text = getSegment(details).getParseText(getUrl(details)).getText();
-
-    return this.summarizer.getSummary(text, query, this.sumContext, 
this.sumLength).toString();
+    return this.summarizer.getSummary(text, query).toString();
   }
     
   private class SummaryThread extends Thread {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java Mon 
May  8 14:04:01 2006
@@ -13,319 +13,30 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.searcher;
 
-import java.io.*;
-import java.util.*;
-
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Summary.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.util.NutchConfiguration;
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
 
-/** Implements hit summarization. */
-public class Summarizer {
-   
-  /** Converts text to tokens. */
-  private Analyzer ANALYZER;
-  private Configuration conf;
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
 
-  /**
-   * The constructor.
-   * @param conf
-   */
-  public Summarizer(Configuration conf) {
-    this.conf = conf;
-    this.ANALYZER = new NutchDocumentAnalyzer(conf);
-  }
 
+/** 
+ * Extension point for summarizer.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface Summarizer extends Configurable, Pluggable {
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = Summarizer.class.getName();
+  
   /**
-   * Class Excerpt represents a single passage found in the document, with some
-   * appropriate regions highlit.
+   * Get a summary for a specified text.
+   * @param text is the text to summarize.
+   * @param query is the query for which the text is a hit.
    */
-  class Excerpt {
-      Vector passages = new Vector();
-      SortedSet tokenSet = new TreeSet();
-      int numTerms = 0;
-
-      /**
-       */
-      public Excerpt() {
-      }
-      
-      /**
-       */
-      public void addToken(String token) {
-          tokenSet.add(token);
-      }
-
-      /**
-       * Return how many unique toks we have
-       */
-      public int numUniqueTokens() {
-          return tokenSet.size();
-      }
-
-      /**
-       * How many fragments we have.
-       */
-      public int numFragments() {
-          return passages.size();
-      }
-
-      public void setNumTerms(int numTerms) {
-          this.numTerms = numTerms;
-      }
-
-      public int getNumTerms() {
-          return numTerms;
-      }
-
-      /**
-       * Add a frag to the list.
-       */
-      public void add(Fragment fragment) {
-          passages.add(fragment);
-      }
-
-      /**
-       * Return an Enum for all the fragments
-       */
-      public Enumeration elements() {
-          return passages.elements();
-      }
-  }
-
-  /** Returns a summary for the given pre-tokenized text. */
-  public Summary getSummary(String text, Query query, int sumContext, int 
sumLength) throws IOException {
-
-    // Simplistic implementation.  Finds the first fragments in the document
-    // containing any query terms.
-    //
-    // TODO: check that phrases in the query are matched in the fragment
-
-    Token[] tokens = getTokens(text);             // parse text to token array
-
-    if (tokens.length == 0)
-      return new Summary();
-
-    String[] terms = query.getTerms();
-    HashSet highlight = new HashSet();            // put query terms in table
-    for (int i = 0; i < terms.length; i++)
-      highlight.add(terms[i]);
-
-    //
-    // Create a SortedSet that ranks excerpts according to
-    // how many query terms are present.  An excerpt is
-    // a Vector full of Fragments and Highlights
-    //
-    SortedSet excerptSet = new TreeSet(new Comparator() {
-        public int compare(Object o1, Object o2) {
-            Excerpt excerpt1 = (Excerpt) o1;
-            Excerpt excerpt2 = (Excerpt) o2;
-
-            if (excerpt1 == null && excerpt2 != null) {
-                return -1;
-            } else if (excerpt1 != null && excerpt2 == null) {
-                return 1;
-            } else if (excerpt1 == null && excerpt2 == null) {
-                return 0;
-            }
-
-            int numToks1 = excerpt1.numUniqueTokens();
-            int numToks2 = excerpt2.numUniqueTokens();
-
-            if (numToks1 < numToks2) {
-                return -1;
-            } else if (numToks1 == numToks2) {
-                return excerpt1.numFragments() - excerpt2.numFragments();
-            } else {
-                return 1;
-            }
-        }
-    }
-        );
-
-    //
-    // Iterate through all terms in the document
-    //
-    int lastExcerptPos = 0;
-    for (int i = 0; i < tokens.length; i++) {
-      //
-      // If we find a term that's in the query...
-      //
-      if (highlight.contains(tokens[i].termText())) {
-        //
-        // Start searching at a point SUM_CONTEXT terms back,
-        // and move SUM_CONTEXT terms into the future.
-        //
-        int startToken = (i > sumContext) ? i - sumContext : 0;
-        int endToken = Math.min(i + sumContext, tokens.length);
-        int offset = tokens[startToken].startOffset();
-        int j = startToken;
-
-        //
-        // Iterate from the start point to the finish, adding
-        // terms all the way.  The end of the passage is always
-        // SUM_CONTEXT beyond the last query-term.
-        //
-        Excerpt excerpt = new Excerpt();
-        if (i != 0) {
-            excerpt.add(new Summary.Ellipsis());
-        }
-
-        //
-        // Iterate through as long as we're before the end of
-        // the document and we haven't hit the max-number-of-items
-        // -in-a-summary.
-        //
-        while ((j < endToken) && (j - startToken < sumLength)) {
-          //
-          // Now grab the hit-element, if present
-          //
-          Token t = tokens[j];
-          if (highlight.contains(t.termText())) {
-            excerpt.addToken(t.termText());
-            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
-            excerpt.add(new 
Highlight(text.substring(t.startOffset(),t.endOffset())));
-            offset = t.endOffset();
-            endToken = Math.min(j + sumContext, tokens.length);
-          }
-
-          j++;
-        }
-
-        lastExcerptPos = endToken;
-
-        //
-        // We found the series of search-term hits and added
-        // them (with intervening text) to the excerpt.  Now 
-        // we need to add the trailing edge of text.
-        //
-        // So if (j < tokens.length) then there is still trailing
-        // text to add.  (We haven't hit the end of the source doc.)
-        // Add the words since the last hit-term insert.
-        //
-        if (j < tokens.length) {
-          excerpt.add(new 
Fragment(text.substring(offset,tokens[j].endOffset())));
-        }
-
-        //
-        // Remember how many terms are in this excerpt
-        //
-        excerpt.setNumTerms(j - startToken);
-
-        //
-        // Store the excerpt for later sorting
-        //
-        excerptSet.add(excerpt);
-
-        //
-        // Start SUM_CONTEXT places away.  The next
-        // search for relevant excerpts begins at i-SUM_CONTEXT
-        //
-        i = j + sumContext;
-      }
-    }
-
-    //
-    // If the target text doesn't appear, then we just
-    // excerpt the first SUM_LENGTH words from the document.
-    //
-    if (excerptSet.size() == 0) {
-        Excerpt excerpt = new Excerpt();
-        int excerptLen = Math.min(sumLength, tokens.length);
-        lastExcerptPos = excerptLen;
-
-        excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), 
tokens[excerptLen-1].startOffset())));
-        excerpt.setNumTerms(excerptLen);
-        excerptSet.add(excerpt);
-    }
-
-    //
-    // Now choose the best items from the excerpt set.
-    // Stop when our Summary grows too large.
-    //
-    double tokenCount = 0;
-    Summary s = new Summary();
-    while (tokenCount <= sumLength && excerptSet.size() > 0) {
-        Excerpt excerpt = (Excerpt) excerptSet.last();
-        excerptSet.remove(excerpt);
-
-        double tokenFraction = (1.0 * excerpt.getNumTerms()) / 
excerpt.numFragments();
-        for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
-            Fragment f = (Fragment) e.nextElement();
-            // Don't add fragments if it takes us over the max-limit
-            if (tokenCount + tokenFraction <= sumLength) {
-                s.add(f);
-            }
-            tokenCount += tokenFraction;
-        }
-    }
-    
-    if (tokenCount > 0 && lastExcerptPos < tokens.length)
-      s.add(new Ellipsis());
-    return s;
-  }
-
-  private Token[] getTokens(String text) throws IOException {
-    ArrayList result = new ArrayList();
-    TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text));
-    for (Token token = ts.next(); token != null; token = ts.next()) {
-      result.add(token);
-    }
-    return (Token[])result.toArray(new Token[result.size()]);
-  }
-
-    /**
-     * Tests Summary-generation.  User inputs the name of a 
-     * text file and a query string
-     */
-    public static void main(String argv[]) throws IOException {
-        // Test arglist
-        if (argv.length < 2) {
-            System.out.println("Usage: java 
org.apache.nutch.searcher.Summarizer <textfile> <queryStr>");
-            return;
-        }
-
-        Summarizer s = new Summarizer(NutchConfiguration.create());
-
-        //
-        // Parse the args
-        //
-        File textFile = new File(argv[0]);
-        StringBuffer queryBuf = new StringBuffer();
-        for (int i = 1; i < argv.length; i++) {
-            queryBuf.append(argv[i]);
-            queryBuf.append(" ");
-        }
-
-        //
-        // Load the text file into a single string.
-        //
-        StringBuffer body = new StringBuffer();
-        BufferedReader in = new BufferedReader(new FileReader(textFile));
-        try {
-            System.out.println("About to read " + textFile + " from " + in);
-            String str = in.readLine();
-            while (str != null) {
-                body.append(str);
-                str = in.readLine();
-            }
-        } finally {
-            in.close();
-        }
+  public Summary getSummary(String text, Query query);
 
-        Configuration conf = NutchConfiguration.create();
-        int sumContext = conf.getInt("searcher.summary.context", 5);
-        int sumLength = conf.getInt("searcher.summary.length", 20);
-        // Convert the query string into a proper Query
-        Query query = Query.parse(queryBuf.toString(), conf);
-        System.out.println("Summary: '" + s.getSummary(body.toString(), query, 
sumContext, sumLength) + "'");
-    }
 }

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java?rev=405165&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java 
(added)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java 
Mon May  8 14:04:01 2006
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.searcher;
+
+// JDK imports
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+
+/**
+ * A factory for retrieving {@link Summarizer} extensions.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class SummarizerFactory {
+
+  /** My logger */
+  public final static Logger LOG =
+    LogFormatter.getLogger(SummarizerFactory.class.getName());
+
+  /** The first available {@link Summarizer} */
+  private Summarizer summarizer = null;
+  
+  
+  public SummarizerFactory(Configuration conf) {
+    try {
+      Extension[] extensions = PluginRepository
+                                    .get(conf)
+                                    .getExtensionPoint(Summarizer.X_POINT_ID)
+                                    .getExtensions();
+      summarizer = (Summarizer) extensions[0].getExtensionInstance();
+      summarizer.setConf(conf);
+      LOG.info("Using the first summarizer extension found: " +
+               extensions[0].getId());
+    } catch (Exception e) {
+      LOG.warning(e.toString());
+    }
+  }
+
+  /**
+   * Get the first available {@link Summarizer} extension.
+   * @return the first available {@link Summarizer} extension, or
+   *         <code>null</code> if none available.
+   */
+  public Summarizer getSummarizer() {
+    return summarizer;
+  }
+
+} 

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SummarizerFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon May  8 14:04:01 2006
@@ -50,6 +50,8 @@
      <ant dir="query-more" target="deploy"/>
      <ant dir="query-site" target="deploy"/>
      <ant dir="query-url" target="deploy"/>
+     <ant dir="summary-basic" target="deploy"/>
+     <ant dir="summary-lucene" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
@@ -126,6 +128,8 @@
     <ant dir="query-more" target="clean"/>
     <ant dir="query-site" target="clean"/>
     <ant dir="query-url" target="clean"/>
+    <ant dir="summary-basic" target="clean"/>
+    <ant dir="summary-lucene" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>

Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=405165&r1=405164&r2=405165&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Mon May  8 
14:04:01 2006
@@ -45,4 +45,8 @@
       id="org.apache.nutch.analysis.NutchAnalyzer"
       name="Nutch Analysis"/>
 
+<extension-point
+      id="org.apache.nutch.searcher.Summarizer"
+      name="Nutch Summarizer"/>
+
 </plugin>

Added: lucene/nutch/trunk/src/plugin/summary-basic/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/build.xml?rev=405165&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-basic/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/summary-basic/build.xml Mon May  8 14:04:01 
2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="summary-basic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/summary-basic/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml?rev=405165&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml Mon May  8 14:04:01 
2006
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<plugin
+   id="summary-basic"
+   name="Basic Summarizer Plug-in"
+   version="1.0.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+      <library name="summary-basic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.summary.basic"
+              name="Basic Summarizer"
+              point="org.apache.nutch.searcher.Summarizer">
+
+      <implementation id="Basic Summarizer"
+                      class="org.apache.nutch.summary.basic.BasicSummarizer"/>
+
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/summary-basic/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java?rev=405165&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
 Mon May  8 14:04:01 2006
@@ -0,0 +1,390 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.summary.basic;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.Vector;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Lucene imports
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+// Nutch imports
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.Summarizer;
+import org.apache.nutch.searcher.Summary;
+import org.apache.nutch.searcher.Summary.Ellipsis;
+import org.apache.nutch.searcher.Summary.Fragment;
+import org.apache.nutch.searcher.Summary.Highlight;
+import org.apache.nutch.util.NutchConfiguration;
+
+
+/**
+ * Implements hit summarization.
+ *
+ * <p>The document text is tokenized, excerpts are built around every
+ * occurrence of a query term, the excerpts are ranked, and the best ones
+ * are concatenated into a {@link Summary} until the configured length is
+ * reached.</p>
+ */
+public class BasicSummarizer implements Summarizer {
+
+  /** Number of tokens of context kept around a query-term hit. */
+  private int sumContext = 5;
+  /** Maximum number of tokens in one excerpt and in the whole summary. */
+  private int sumLength = 20;
+  /** Converts text to tokens; stays null until {@link #setConf} is called. */
+  private Analyzer analyzer = null;
+  private Configuration conf = null;
+
+
+  /**
+   * Creates an unconfigured summarizer.  {@link #setConf(Configuration)}
+   * must be called before {@link #getSummary(String, Query)}.
+   */
+  public BasicSummarizer() { }
+
+  /** Creates a summarizer configured from <code>conf</code>. */
+  private BasicSummarizer(Configuration conf) {
+    setConf(conf);
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration, creates the analyzer and reads the
+   * <code>searcher.summary.context</code> (default 5) and
+   * <code>searcher.summary.length</code> (default 20) properties.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.analyzer = new NutchDocumentAnalyzer(conf);
+    this.sumContext = conf.getInt("searcher.summary.context", 5);
+    this.sumLength = conf.getInt("searcher.summary.length", 20);
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+
+  /* --------------------------- *
+   * <implementation:Summarizer> *
+   * --------------------------- */
+
+  /**
+   * Builds a summary of <code>text</code> for <code>query</code>.
+   *
+   * @param text  the document text to summarize
+   * @param query the query whose terms are highlighted
+   * @return the summary; it is empty when the text yields no token
+   */
+  public Summary getSummary(String text, Query query) {
+
+    // Simplistic implementation.  Finds the first fragments in the document
+    // containing any query terms.
+    //
+    // TODO: check that phrases in the query are matched in the fragment
+
+    Token[] tokens = getTokens(text);             // parse text to token array
+
+    if (tokens.length == 0)
+      return new Summary();
+
+    String[] terms = query.getTerms();
+    HashSet highlight = new HashSet();            // put query terms in table
+    for (int i = 0; i < terms.length; i++)
+      highlight.add(terms[i]);
+
+    //
+    // Create a SortedSet that ranks excerpts according to
+    // how many query terms are present.  An excerpt is
+    // a Vector full of Fragments and Highlights
+    //
+    SortedSet excerptSet = new TreeSet(new Comparator() {
+      public int compare(Object o1, Object o2) {
+        Excerpt excerpt1 = (Excerpt) o1;
+        Excerpt excerpt2 = (Excerpt) o2;
+
+        if (excerpt1 == null && excerpt2 != null) {
+          return -1;
+        } else if (excerpt1 != null && excerpt2 == null) {
+          return 1;
+        } else if (excerpt1 == null && excerpt2 == null) {
+          return 0;
+        }
+
+        int numToks1 = excerpt1.numUniqueTokens();
+        int numToks2 = excerpt2.numUniqueTokens();
+        if (numToks1 != numToks2) {
+          return (numToks1 < numToks2) ? -1 : 1;
+        }
+
+        int numFrags1 = excerpt1.numFragments();
+        int numFrags2 = excerpt2.numFragments();
+        if (numFrags1 != numFrags2) {
+          return (numFrags1 < numFrags2) ? -1 : 1;
+        }
+
+        //
+        // Tie-break on document order.  Without it, two distinct excerpts
+        // with the same counts compare as equal and the TreeSet silently
+        // drops the second one.  The earlier excerpt ranks higher.
+        //
+        int order1 = excerpt1.getOrder();
+        int order2 = excerpt2.getOrder();
+        if (order1 == order2) {
+          return 0;
+        }
+        return (order1 < order2) ? 1 : -1;
+      }
+    }
+    );
+
+    //
+    // Iterate through all terms in the document
+    //
+    int lastExcerptPos = 0;
+    int excerptCount = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      //
+      // If we find a term that's in the query...
+      //
+      if (highlight.contains(tokens[i].termText())) {
+        //
+        // Start searching at a point SUM_CONTEXT terms back,
+        // and move SUM_CONTEXT terms into the future.
+        //
+        int startToken = (i > sumContext) ? i - sumContext : 0;
+        int endToken = Math.min(i + sumContext, tokens.length);
+        int offset = tokens[startToken].startOffset();
+        int j = startToken;
+
+        //
+        // Iterate from the start point to the finish, adding
+        // terms all the way.  The end of the passage is always
+        // SUM_CONTEXT beyond the last query-term.
+        //
+        Excerpt excerpt = new Excerpt();
+        excerpt.setOrder(excerptCount++);
+        if (i != 0) {
+          excerpt.add(new Summary.Ellipsis());
+        }
+
+        //
+        // Iterate through as long as we're before the end of
+        // the document and we haven't hit the max-number-of-items
+        // -in-a-summary.
+        //
+        while ((j < endToken) && (j - startToken < sumLength)) {
+          //
+          // Now grab the hit-element, if present
+          //
+          Token t = tokens[j];
+          if (highlight.contains(t.termText())) {
+            excerpt.addToken(t.termText());
+            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
+            excerpt.add(new Highlight(text.substring(t.startOffset(), t.endOffset())));
+            offset = t.endOffset();
+            endToken = Math.min(j + sumContext, tokens.length);
+          }
+
+          j++;
+        }
+
+        lastExcerptPos = endToken;
+
+        //
+        // We found the series of search-term hits and added
+        // them (with intervening text) to the excerpt.  Now
+        // we need to add the trailing edge of text.
+        //
+        if (j < tokens.length) {
+          //
+          // We haven't hit the end of the source doc: add the
+          // words since the last hit-term insert.
+          //
+          excerpt.add(new Fragment(text.substring(offset, tokens[j].endOffset())));
+        } else {
+          //
+          // The window reached the end of the document: keep the text
+          // that follows the last hit-term instead of dropping it.
+          //
+          int docEnd = tokens[tokens.length - 1].endOffset();
+          if (docEnd > offset) {
+            excerpt.add(new Fragment(text.substring(offset, docEnd)));
+          }
+        }
+
+        //
+        // Remember how many terms are in this excerpt
+        //
+        excerpt.setNumTerms(j - startToken);
+
+        //
+        // Store the excerpt for later sorting
+        //
+        excerptSet.add(excerpt);
+
+        //
+        // Start SUM_CONTEXT places away.  The next
+        // search for relevant excerpts begins at i-SUM_CONTEXT
+        //
+        i = j + sumContext;
+      }
+    }
+
+    //
+    // If the target text doesn't appear, then we just
+    // excerpt the first SUM_LENGTH words from the document.
+    //
+    if (excerptSet.size() == 0) {
+      Excerpt excerpt = new Excerpt();
+      int excerptLen = Math.min(sumLength, tokens.length);
+      lastExcerptPos = excerptLen;
+
+      // End at the END of the last token so that it is not cut off.
+      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(),
+                                              tokens[excerptLen - 1].endOffset())));
+      excerpt.setNumTerms(excerptLen);
+      excerptSet.add(excerpt);
+    }
+
+    //
+    // Now choose the best items from the excerpt set.
+    // Stop when our Summary grows too large.
+    //
+    double tokenCount = 0;
+    Summary s = new Summary();
+    while (tokenCount <= sumLength && excerptSet.size() > 0) {
+      Excerpt excerpt = (Excerpt) excerptSet.last();
+      excerptSet.remove(excerpt);
+
+      // Each fragment is charged an equal share of the excerpt's terms.
+      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
+      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
+        Fragment f = (Fragment) e.nextElement();
+        // Don't add fragments if it takes us over the max-limit
+        if (tokenCount + tokenFraction <= sumLength) {
+          s.add(f);
+        }
+        tokenCount += tokenFraction;
+      }
+    }
+
+    if (tokenCount > 0 && lastExcerptPos < tokens.length)
+      s.add(new Ellipsis());
+    return s;
+  }
+
+  /* ---------------------------- *
+   * </implementation:Summarizer> *
+   * ---------------------------- */
+
+
+  /**
+   * Class Excerpt represents a single passage found in the document, with some
+   * appropriate regions highlit.
+   */
+  static class Excerpt {
+    Vector passages = new Vector();
+    SortedSet tokenSet = new TreeSet();
+    int numTerms = 0;
+    /** Position of this excerpt among the excerpts found in the document. */
+    int order = 0;
+
+    /**
+     * Creates an empty excerpt.
+     */
+    public Excerpt() {
+    }
+
+    /**
+     * Records a query term found in this excerpt.
+     */
+    public void addToken(String token) {
+      tokenSet.add(token);
+    }
+
+    /**
+     * Return how many unique toks we have
+     */
+    public int numUniqueTokens() {
+      return tokenSet.size();
+    }
+
+    /**
+     * How many fragments we have.
+     */
+    public int numFragments() {
+      return passages.size();
+    }
+
+    public void setNumTerms(int numTerms) {
+      this.numTerms = numTerms;
+    }
+
+    public int getNumTerms() {
+      return numTerms;
+    }
+
+    /**
+     * Sets the position of this excerpt in document order; used to
+     * break ties when ranking excerpts.
+     */
+    public void setOrder(int order) {
+      this.order = order;
+    }
+
+    public int getOrder() {
+      return order;
+    }
+
+    /**
+     * Add a frag to the list.
+     */
+    public void add(Fragment fragment) {
+      passages.add(fragment);
+    }
+
+    /**
+     * Return an Enum for all the fragments
+     */
+    public Enumeration elements() {
+      return passages.elements();
+    }
+  }
+
+
+  /**
+   * Tokenizes <code>text</code> with the configured analyzer.
+   *
+   * An I/O error while reading the stream ends tokenization; the tokens
+   * read so far are returned.  The stream is always closed.
+   */
+  private Token[] getTokens(String text) {
+    ArrayList result = new ArrayList();
+    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
+    try {
+      Token token = ts.next();
+      while (token != null) {
+        result.add(token);
+        token = ts.next();
+      }
+    } catch (IOException e) {
+      // best effort: keep the tokens read so far
+    } finally {
+      try {
+        ts.close();
+      } catch (IOException e) {
+        // ignore
+      }
+    }
+    return (Token[]) result.toArray(new Token[result.size()]);
+  }
+
+  /**
+   * Tests Summary-generation.  User inputs the name of a
+   * text file and a query string
+   */
+  public static void main(String argv[]) throws IOException {
+    // Test arglist
+    if (argv.length < 2) {
+      System.out.println("Usage: java org.apache.nutch.summary.basic.BasicSummarizer <textfile> <queryStr>");
+      return;
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    Summarizer s = new BasicSummarizer(conf);
+
+    //
+    // Parse the args
+    //
+    File textFile = new File(argv[0]);
+    StringBuffer queryBuf = new StringBuffer();
+    for (int i = 1; i < argv.length; i++) {
+      queryBuf.append(argv[i]);
+      queryBuf.append(" ");
+    }
+
+    //
+    // Load the text file into a single string.
+    //
+    StringBuffer body = new StringBuffer();
+    BufferedReader in = new BufferedReader(new FileReader(textFile));
+    try {
+      System.out.println("About to read " + textFile + " from " + in);
+      String str = in.readLine();
+      while (str != null) {
+        body.append(str);
+        // readLine() strips the line terminator: put one back so that the
+        // last word of a line is not glued to the first word of the next.
+        body.append('\n');
+        str = in.readLine();
+      }
+    } finally {
+      in.close();
+    }
+
+    // Convert the query string into a proper Query
+    Query query = Query.parse(queryBuf.toString(), conf);
+    System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
+  }
+}

Propchange: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html?rev=405165&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
 (added)
+++ 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
 Mon May  8 14:04:01 2006
@@ -0,0 +1,7 @@
+<html>
+<body>
+<p>
+A basic summarizer implementation.
+</p>
+</body>
+</html>

Propchange: 
lucene/nutch/trunk/src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/summary-lucene/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/build.xml?rev=405165&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-lucene/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/summary-lucene/build.xml Mon May  8 14:04:01 
2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="summary-lucene" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar?rev=405165&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?rev=405165&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Mon May  8 14:04:01 
2006
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<plugin
+   id="summary-lucene"
+   name="Lucene Highlighter Summary Plug-in"
+   version="1.0.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+      <library name="summary-lucene.jar">
+         <export name="*"/>
+      </library>
+      <library name="lucene-highlighter-2.0-rc1-dev.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!-- Registers LuceneSummarizer as an implementation of the
+        org.apache.nutch.searcher.Summarizer extension point.  The ids are
+        specific to this plug-in so that they do not collide with those of
+        the summary-basic plug-in. -->
+   <extension id="org.apache.nutch.summary.lucene"
+              name="Lucene Highlighter Summarizer"
+              point="org.apache.nutch.searcher.Summarizer">
+
+      <implementation id="Lucene Summarizer"
+                      class="org.apache.nutch.summary.lucene.LuceneSummarizer"/>
+
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java?rev=405165&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
 Mon May  8 14:04:01 2006
@@ -0,0 +1,119 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.summary.lucene;
+
+// JDK imports
+import java.io.StringReader;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Lucene imports
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.WeightedTerm;
+
+// Nutch imports
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.Summarizer;
+import org.apache.nutch.searcher.Summary;
+import org.apache.nutch.searcher.Summary.Ellipsis;
+import org.apache.nutch.searcher.Summary.Fragment;
+import org.apache.nutch.searcher.Summary.Highlight;
+
+
+/**
+ * Hit summarizer that delegates the selection and the marking of the
+ * best fragments to the Lucene highlighter.
+ */
+public class LuceneSummarizer implements Summarizer {
+
+  /** Marker the formatter writes before and after each highlighted term. */
+  private final static String SEPARATOR = "###";
+  private final static Formatter FORMATTER =
+          new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);
+
+  /** Converts text to tokens. */
+  private Analyzer analyzer = null;
+  private Configuration conf = null;
+
+  public LuceneSummarizer() { }
+
+  private LuceneSummarizer(Configuration conf) {
+    setConf(conf);
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /** Stores the configuration and creates the analyzer from it. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.analyzer = new NutchDocumentAnalyzer(conf);
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+
+  /* --------------------------- *
+   * <implementation:Summarizer> *
+   * --------------------------- */
+
+  /**
+   * Builds a summary from the (at most three) best fragments returned by
+   * the highlighter, each one followed by an ellipsis.  Any exception
+   * raised while highlighting is ignored and the summary built so far is
+   * returned.
+   */
+  public Summary getSummary(String text, Query query) {
+
+    // Every query term gets the same weight.
+    String[] queryTerms = query.getTerms();
+    WeightedTerm[] weightedTerms = new WeightedTerm[queryTerms.length];
+    for (int t = 0; t < queryTerms.length; t++) {
+      weightedTerms[t] = new WeightedTerm(1.0f, queryTerms[t]);
+    }
+
+    QueryScorer scorer = new QueryScorer(weightedTerms);
+    Highlighter highlighter = new Highlighter(FORMATTER, scorer);
+    TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
+
+    Summary summary = new Summary();
+    try {
+      // TODO : The max number of fragments (3) should be configurable
+      String[] fragments = highlighter.getBestFragments(stream, text, 3);
+      for (int f = 0; f < fragments.length; f++) {
+        // Splitting on the marker alternates plain and highlighted text:
+        // even pieces are plain, odd pieces are highlighted.
+        String[] pieces = fragments[f].split(SEPARATOR);
+        for (int p = 0; p < pieces.length; p++) {
+          if (p % 2 == 0) {
+            summary.add(new Fragment(pieces[p]));
+          } else {
+            summary.add(new Highlight(pieces[p]));
+          }
+        }
+        summary.add(new Ellipsis());
+      }
+    } catch (Exception e) {
+      // Nothing to do...
+    }
+    return summary;
+  }
+
+  /* ---------------------------- *
+   * </implementation:Summarizer> *
+   * ---------------------------- */
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html?rev=405165&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
 (added)
+++ 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
 Mon May  8 14:04:01 2006
@@ -0,0 +1,7 @@
+<html>
+<body>
+<p>
+A Lucene Highlighter based summarizer implementation.
+</p>
+</body>
+</html>

Propchange: 
lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r405165 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/searcher/ src/plugin/ src/plugin/nutch-extensionpoints/ src/plugin/summary-basic/ src/plugin/summary-basic/src/ src/plugin/summary-basic/src/java/ src/plugin/summary-basic...

Reply via email to