Author: mattmann
Date: Sun Jun 17 10:19:14 2007
New Revision: 548076

URL: http://svn.apache.org/viewvc?view=rev&rev=548076
Log:
- fix for NUTCH-443 (contributed by Dogacan)

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Jun 17 10:19:14 2007
@@ -32,6 +32,13 @@
 
 11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan)
 
+12. NUTCH-443 - allow parsers to return multiple Parse object, this will speed 
+    up the rss parser (dogacan via mattmann). This update is a fix and semantics
+    change from the original patch for NUTCH-443. The original patch did not tell
+    the  Indexer to read crawl_parse too so that it can pickup sub-urls' fetch 
+    datums. This patch addresses that issue. Now, if Fetcher gets a null content, 
+    instead of pushing an empty content, it filters the null content.
+    
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 17 10:19:14 2007
@@ -288,78 +288,75 @@
       datum.setFetchTime(System.currentTimeMillis());
       if (pstatus != null) 
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
 
-      if (content == null) {
-        String url = key.toString();
-        content = new Content(url, url, new byte[0], "", new Metadata(), 
this.conf);
-      }
-      Metadata metadata = content.getMetadata();
-      // add segment to metadata
-      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
-      // add score to content metadata so that ParseSegment can pick it up.
-      try {
-        scfilters.passScoreBeforeParsing(key, datum, content);
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          e.printStackTrace(LogUtil.getWarnStream(LOG));
-          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
-        }
-      }
-
-      /* Note: Fetcher will only follow meta-redirects coming from the
-       * original URL. */ 
       ParseResult parseResult = null;
-      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+      if (content != null) {
+        Metadata metadata = content.getMetadata();
+        // add segment to metadata
+        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+        // add score to content metadata so that ParseSegment can pick it up.
         try {
-          parseResult = this.parseUtil.parse(content);
+          scfilters.passScoreBeforeParsing(key, datum, content);
         } catch (Exception e) {
-          LOG.warn("Error parsing: " + key + ": " + 
StringUtils.stringifyException(e));
+          if (LOG.isWarnEnabled()) {
+            e.printStackTrace(LogUtil.getWarnStream(LOG));
+            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+          }
         }
+        /* Note: Fetcher will only follow meta-redirects coming from the
+         * original URL. */ 
+        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+          try {
+            parseResult = this.parseUtil.parse(content);
+          } catch (Exception e) {
+            LOG.warn("Error parsing: " + key + ": " + 
StringUtils.stringifyException(e));
+          }
 
-        if (parseResult != null) {
-          for (Entry<Text, Parse> entry : parseResult) {
-            Text url = entry.getKey();
-            Parse parse = entry.getValue();
-            ParseStatus parseStatus = parse.getData().getStatus();
-            
-            if (!parseStatus.isSuccess()) {
-              LOG.warn("Error parsing: " + key + ": " + parseStatus);
-              parse = parseStatus.getEmptyParse(getConf());
-            }
+          if (parseResult != null) {
+            for (Entry<Text, Parse> entry : parseResult) {
+              Text url = entry.getKey();
+              Parse parse = entry.getValue();
+              ParseStatus parseStatus = parse.getData().getStatus();
+              
+              if (!parseStatus.isSuccess()) {
+                LOG.warn("Error parsing: " + key + ": " + parseStatus);
+                parse = parseStatus.getEmptyParse(getConf());
+              }
 
-            // Calculate page signature. For non-parsing fetchers this will
-            // be done in ParseSegment
-            byte[] signature = 
-              SignatureFactory.getSignature(getConf()).calculate(content, 
parse);
-            // Ensure segment name and score are in parseData metadata
-            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
-                segmentName);
-            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
-                StringUtil.toHexString(signature));
-            // Pass fetch time to content meta
-            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
-                Long.toString(datum.getFetchTime()));
-            if (url.equals(key))
-              datum.setSignature(signature);
-            try {
-              scfilters.passScoreAfterParsing(url, content, parse);
-            } catch (Exception e) {
-              if (LOG.isWarnEnabled()) {
-                e.printStackTrace(LogUtil.getWarnStream(LOG));
-                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+              // Calculate page signature. For non-parsing fetchers this will
+              // be done in ParseSegment
+              byte[] signature = 
+                SignatureFactory.getSignature(getConf()).calculate(content, 
parse);
+              // Ensure segment name and score are in parseData metadata
+              parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
+                  segmentName);
+              parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
+                  StringUtil.toHexString(signature));
+              // Pass fetch time to content meta
+              parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+                  Long.toString(datum.getFetchTime()));
+              if (url.equals(key))
+                datum.setSignature(signature);
+              try {
+                scfilters.passScoreAfterParsing(url, content, parse);
+              } catch (Exception e) {
+                if (LOG.isWarnEnabled()) {
+                  e.printStackTrace(LogUtil.getWarnStream(LOG));
+                  LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+                }
               }
             }
+          } else {
+            byte[] signature = 
+              SignatureFactory.getSignature(getConf()).calculate(content, 
+                  new ParseStatus().getEmptyParse(conf));
+            datum.setSignature(signature);
           }
-        } else {
-          byte[] signature = 
-            SignatureFactory.getSignature(getConf()).calculate(content, 
-                new ParseStatus().getEmptyParse(conf));
-          datum.setSignature(signature);
         }
       }
 
       try {
         output.collect(key, new ObjectWritable(datum));
-        if (storingContent)
+        if (content != null && storingContent)
           output.collect(key, new ObjectWritable(content));
         if (parseResult != null) {
           for (Entry<Text, Parse> entry : parseResult) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 17 10:19:14 2007
@@ -662,72 +662,69 @@
       datum.setFetchTime(System.currentTimeMillis());
       if (pstatus != null) 
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
 
-      if (content == null) {
-        String url = key.toString();
-        content = new Content(url, url, new byte[0], "", new Metadata(), 
this.conf);
-      }
-      Metadata metadata = content.getMetadata();
-      // add segment to metadata
-      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
-      // add score to content metadata so that ParseSegment can pick it up.
-      try {
-        scfilters.passScoreBeforeParsing(key, datum, content);
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          e.printStackTrace(LogUtil.getWarnStream(LOG));
-          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
-        }
-      }
-
-      /* Note: Fetcher will only follow meta-redirects coming from the
-       * original URL. */ 
       ParseResult parseResult = null;
-      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+      if (content != null) {
+        Metadata metadata = content.getMetadata();
+        // add segment to metadata
+        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+        // add score to content metadata so that ParseSegment can pick it up.
         try {
-          parseResult = this.parseUtil.parse(content);
+          scfilters.passScoreBeforeParsing(key, datum, content);
         } catch (Exception e) {
-          LOG.warn("Error parsing: " + key + ": " + 
StringUtils.stringifyException(e));
+          if (LOG.isWarnEnabled()) {
+            e.printStackTrace(LogUtil.getWarnStream(LOG));
+            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+          }
         }
+        /* Note: Fetcher will only follow meta-redirects coming from the
+         * original URL. */ 
+        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+          try {
+            parseResult = this.parseUtil.parse(content);
+          } catch (Exception e) {
+            LOG.warn("Error parsing: " + key + ": " + 
StringUtils.stringifyException(e));
+          }
 
-        if (parseResult != null) {
-          for (Entry<Text, Parse> entry : parseResult) {
-            Text url = entry.getKey();
-            Parse parse = entry.getValue();
-            ParseStatus parseStatus = parse.getData().getStatus();
+          if (parseResult != null) {
+            for (Entry<Text, Parse> entry : parseResult) {
+              Text url = entry.getKey();
+              Parse parse = entry.getValue();
+              ParseStatus parseStatus = parse.getData().getStatus();
 
-            if (!parseStatus.isSuccess()) {
-              LOG.warn("Error parsing: " + key + ": " + parseStatus);
-              parse = parseStatus.getEmptyParse(getConf());
-            }
+              if (!parseStatus.isSuccess()) {
+                LOG.warn("Error parsing: " + key + ": " + parseStatus);
+                parse = parseStatus.getEmptyParse(getConf());
+              }
 
-            // Calculate page signature. For non-parsing fetchers this will
-            // be done in ParseSegment
-            byte[] signature = 
-              SignatureFactory.getSignature(getConf()).calculate(content, 
parse);
-            // Ensure segment name and score are in parseData metadata
-            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
-                segmentName);
-            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
-                StringUtil.toHexString(signature));
-            // Pass fetch time to content meta
-            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
-                Long.toString(datum.getFetchTime()));
-            if (url.equals(key))
-              datum.setSignature(signature);
-            try {
-              scfilters.passScoreAfterParsing(url, content, parse);
-            } catch (Exception e) {
-              if (LOG.isWarnEnabled()) {
-                e.printStackTrace(LogUtil.getWarnStream(LOG));
-                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+              // Calculate page signature. For non-parsing fetchers this will
+              // be done in ParseSegment
+              byte[] signature = 
+                SignatureFactory.getSignature(getConf()).calculate(content, 
parse);
+              // Ensure segment name and score are in parseData metadata
+              parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
+                  segmentName);
+              parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
+                  StringUtil.toHexString(signature));
+              // Pass fetch time to content meta
+              parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+                  Long.toString(datum.getFetchTime()));
+              if (url.equals(key))
+                datum.setSignature(signature);
+              try {
+                scfilters.passScoreAfterParsing(url, content, parse);
+              } catch (Exception e) {
+                if (LOG.isWarnEnabled()) {
+                  e.printStackTrace(LogUtil.getWarnStream(LOG));
+                  LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+                }
               }
             }
+          } else {
+            byte[] signature = 
+              SignatureFactory.getSignature(getConf()).calculate(content, 
+                  new ParseStatus().getEmptyParse(conf));
+            datum.setSignature(signature);
           }
-        } else {
-          byte[] signature = 
-            SignatureFactory.getSignature(getConf()).calculate(content, 
-                new ParseStatus().getEmptyParse(conf));
-          datum.setSignature(signature);
         }
       }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 17 10:19:14 2007
@@ -153,7 +153,6 @@
     Inlinks inlinks = null;
     CrawlDatum dbDatum = null;
     CrawlDatum fetchDatum = null;
-    CrawlDatum redir = null;
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
@@ -168,11 +167,12 @@
           // don't index unmodified (empty) pages
           if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
             fetchDatum = datum;
-        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
-          // redirected page
-          redir = datum;
-        else
+        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
+                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
+          continue;
+        } else {
           throw new RuntimeException("Unexpected status: "+datum.getStatus());
+        }
       } else if (value instanceof ParseData) {
         parseData = (ParseData)value;
       } else if (value instanceof ParseText) {
@@ -181,11 +181,6 @@
         LOG.warn("Unrecognized type: "+value.getClass());
       }
     }      
-    if (redir != null) {
-      // XXX page was redirected - what should we do?
-      // XXX discard it for now
-      return;
-    }
 
     if (fetchDatum == null || dbDatum == null
         || parseText == null || parseData == null) {
@@ -260,6 +255,7 @@
         LOG.info("Indexer: adding segment: " + segments[i]);
       }
       job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
+      job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME));
       job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
       job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
     }


Reply via email to