Author: mattmann
Date: Sun May 17 23:57:44 2015
New Revision: 1679911
URL: http://svn.apache.org/r1679911
Log:
Re-apply NUTCH-1854 after it was mistakenly rolled back during NUTCH-1973.
Modified:
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1679911&r1=1679910&r2=1679911&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sun May 17
23:57:44 2015
@@ -21,6 +21,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
@@ -31,6 +33,7 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
@@ -197,6 +200,11 @@ public class ParseSegment extends NutchT
}
public void parse(Path segment) throws IOException {
+ if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+ LOG.warn("Segment: " + segment
+ + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+ return;
+ }
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1679911&r1=1679910&r2=1679911&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sun May
17 23:57:44 2015
@@ -113,6 +113,19 @@ public class SegmentChecker {
return false;
}
+
+ }
+
+ /**
+ * Check the segment to see if it has been parsed before.
+ */
+ public static boolean isParsed(Path segment, FileSystem fs)
+ throws IOException {
+
+ if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))){
+ return true;
+ }
+ return false;
}
}
\ No newline at end of file