Author: dogacan
Date: Sun Jun 17 13:27:17 2007
New Revision: 548103

URL: http://svn.apache.org/viewvc?view=rev&rev=548103
Log:
NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of 
Parse object.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java
    
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
    
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
    
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Jun 17 13:27:17 2007
@@ -39,6 +39,8 @@
     datums. This patch addresses that issue. Now, if Fetcher gets a null content, 
     instead of pushing an empty content, it filters the null content.
     
+13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of Parse object. (Gal Nitzan via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java Sun 
Jun 17 13:27:17 2007
@@ -38,5 +38,5 @@
 
   /** Adds metadata or otherwise modifies a parse of HTML content, given
    * the DOM tree of a page. */
-  Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
+  ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc);
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
Sun Jun 17 13:27:17 2007
@@ -59,18 +59,23 @@
   /** Run all defined filters. */
   public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
 
-    ParseResult filteredParseResult = new ParseResult(content.getUrl());
-    
-    for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
-      Parse parse = entry.getValue();
-      for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
-        parse = this.htmlParseFilters[i].filter(content, parse, metaTags, doc);
-        if (!parse.getData().getStatus().isSuccess()) break;
+    // loop on each filter
+    for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
+      // call filter interface
+      parseResult =
+        htmlParseFilters[i].filter(content, parseResult, metaTags, doc);
+
+      // any failure on parse obj, return
+      if (!parseResult.isSuccess()) {
+        // TODO: What happens when parseResult.isEmpty() ?
+        // Maybe clone parseResult and use parseResult as backup...
+
+        // remove failed parse before return
+        parseResult.filter();
+        return parseResult;
       }
-      filteredParseResult.put(entry.getKey(), 
-                              new ParseText(parse.getText()), parse.getData());
     }
 
-    return filteredParseResult;
+    return parseResult;
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Sun Jun 
17 13:27:17 2007
@@ -139,4 +139,18 @@
     }
       
   }
+
+  /**
+   * A convenience method which returns true only if all parses are successful.
+   * Parse success is determined by {@link ParseStatus#isSuccess()}
+   */
+  public boolean isSuccess() {
+    for(Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+      Entry<Text, Parse> entry = i.next();
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        return false;
+      }
+    }
+    return true;
+  }
 }

Modified: 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
 Sun Jun 17 13:27:17 2007
@@ -22,6 +22,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -263,24 +264,35 @@
 
   /** Adds metadata or otherwise modifies a parse of an HTML document, given
    * the DOM tree of a page. */
-  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
 
     // construct base url
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      return new ParseStatus(e).getEmptyParse(getConf());
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), 
+                      new ParseText(emptyParse.getText()), 
+                      emptyParse.getData());
+      return parseResult;
     }
 
     try {
       // extract license metadata
       Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
     } catch (ParseException e) {
-      return new ParseStatus(e).getEmptyParse(getConf());
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), 
+                      new ParseText(emptyParse.getText()), 
+                      emptyParse.getData());
+      return parseResult;
     }
 
-    return parse;
+    return parseResult;
   }
 
   public void setConf(Configuration conf) {

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Sun Jun 17 13:27:17 2007
@@ -30,11 +30,13 @@
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.protocol.Content;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 
 // DOM imports
 import org.w3c.dom.DocumentFragment;
@@ -84,8 +86,10 @@
    * <li>3. meta http-equiv (content-language) 
(http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
    * <br>Only the first occurence of language is stored.
    */
-  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
     
+    Parse parse = parseResult.get(content.getUrl());
+
     // Trying to find the document's language
     LanguageParser parser = new LanguageParser(doc);
     String lang = parser.getLanguage();
@@ -93,7 +97,7 @@
     if (lang != null) {
       parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
     }
-    return parse;
+    return parseResult;
   }
 
   static class LanguageParser {

Modified: 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
 Sun Jun 17 13:27:17 2007
@@ -35,12 +35,14 @@
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.StringUtil;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 
 
 /**
@@ -63,8 +65,11 @@
   /**
    * Scan the HTML document looking at possible rel-tags
    */
-  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult,
+    HTMLMetaTags metaTags, DocumentFragment doc) {
     
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
     // Trying to find the document's rel-tags
     Parser parser = new Parser(doc);
     Set tags = parser.getRelTags();
@@ -73,7 +78,7 @@
     while (iter.hasNext()) {
       metadata.add(REL_TAG, (String) iter.next());
     }
-    return parse;
+    return parseResult;
   }
 
   private static class Parser {

Modified: 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=548103&r1=548102&r2=548103
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 Sun Jun 17 13:27:17 2007
@@ -36,11 +36,13 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
 import org.apache.oro.text.regex.PatternCompiler;
@@ -69,7 +71,11 @@
 
   private Configuration conf;
   
-  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, 
DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult,
+    HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
     String url = content.getBaseUrl();
     ArrayList outlinks = new ArrayList();
     walk(doc, parse, metaTags, url, outlinks);
@@ -85,9 +91,11 @@
                                           parse.getData().getContentMeta(),
                                           parse.getData().getParseMeta());
       parseData.setConf(this.conf);
-      parse = new ParseImpl(text, parseData);
+
+      // replace original parse obj with new one
+      parseResult.put(content.getUrl(), new ParseText(text), parseData);
     }
-    return parse;
+    return parseResult;
   }
   
   private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, 
List outlinks) {


Reply via email to