Author: snagel
Date: Sat Apr 18 20:41:13 2015
New Revision: 1674581
URL: http://svn.apache.org/r1674581
Log:
NUTCH-1854 bin/crawl fails with a parsing fetcher
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 18 20:41:13 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
+
* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro
via mattmann)
* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via
mattmann)
@@ -12,7 +14,7 @@ Nutch Current Development 1.10-SNAPSHOT
* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via
mattmann)
-* NUTCH-1911 Imeprove DomainStatistics tool command line parsing (Michael
Joyce via mattmann)
+* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce
via mattmann)
* NUTCH-1981 Upgrade to icu4j 55.1 (Marko Asplund via snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sat Apr 18 20:41:13 2015
@@ -22,6 +22,7 @@ import org.slf4j.LoggerFactory;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
@@ -32,6 +33,7 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
@@ -198,6 +200,11 @@ public class ParseSegment extends Config
}
public void parse(Path segment) throws IOException {
+ if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+ LOG.warn("Segment: " + segment
+ + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+ return;
+ }
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1674581&r1=1674580&r2=1674581&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sat Apr 18 20:41:13 2015
@@ -115,4 +115,16 @@ public class SegmentChecker {
}
}
-}
\ No newline at end of file
+ /**
+ * Check the segment to see if it is has been parsed before.
+ */
+ public static boolean isParsed(Path segment, FileSystem fs)
+ throws IOException {
+
+ if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME)))
+ return true;
+ return false;
+
+ }
+
+}