Author: snagel
Date: Sat Apr 18 20:41:13 2015
New Revision: 1674581

URL: http://svn.apache.org/r1674581
Log:
NUTCH-1854 bin/crawl fails with a parsing fetcher

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 20:41:13 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
+
 * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
 
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
@@ -12,7 +14,7 @@ Nutch Current Development 1.10-SNAPSHOT
 
 * NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann)
 
-* NUTCH-1911 Imeprove DomainStatistics tool command line parsing (Michael Joyce via mattmann)
+* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce via mattmann)
 
 * NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
 

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sat Apr 18 20:41:13 2015
@@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
@@ -32,6 +33,7 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.util.*;
 import org.apache.hadoop.fs.Path;
 
@@ -198,6 +200,11 @@ public class ParseSegment extends Config
   }
 
   public void parse(Path segment) throws IOException {
+       if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+         LOG.warn("Segment: " + segment
+                         + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+         return;
+       }
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sat Apr 18 20:41:13 2015
@@ -115,4 +115,16 @@ public class SegmentChecker {
     }
   }
 
-}
\ No newline at end of file
+  /**
+   * Check the segment to see if it has been parsed before, i.e. whether {@code CrawlDatum.PARSE_DIR_NAME} exists under the segment path on the given file system.
+   */
+  public static boolean isParsed(Path segment, FileSystem fs)
+         throws IOException {
+
+       if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME)))
+         return true;
+       return false;
+
+  } 
+
+}


Reply via email to