Author: mattmann
Date: Sun May 17 23:57:44 2015
New Revision: 1679911

URL: http://svn.apache.org/r1679911
Log:
Re-apply NUTCH-1854 after it was mistakenly rolled back during NUTCH-1973.

Modified:
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1679911&r1=1679910&r2=1679911&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sun May 17 23:57:44 2015
@@ -21,6 +21,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
@@ -31,6 +33,7 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.util.*;
 import org.apache.hadoop.fs.Path;
 
@@ -197,6 +200,11 @@ public class ParseSegment extends NutchT
   }
 
   public void parse(Path segment) throws IOException {
+     if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+         LOG.warn("Segment: " + segment
+         + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
+          return;
+      }
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1679911&r1=1679910&r2=1679911&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Sun May 17 23:57:44 2015
@@ -113,6 +113,19 @@ public class SegmentChecker {
 
       return false;
     }
+
+  }
+
+  /**
+   * Check the segment to see if it has been parsed before.
+   */
+  public static boolean isParsed(Path segment, FileSystem fs)
+      throws IOException {
+
+      if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))){
+       return true;
+      }
+      return false;
   }
 
 }
\ No newline at end of file


Reply via email to