Author: markus
Date: Wed Jul 1 07:00:40 2015
New Revision: 1688566
URL: http://svn.apache.org/r1688566
Log: NUTCH-1692 SegmentReader was broken in distributed mode
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688566&r1=1688565&r2=1688566&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 07:00:40 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp) + * NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1688566&r1=1688565&r2=1688566&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jul 1 07:00:40 2015 @@ -507,55 +507,64 @@ public class SegmentReader extends Confi public void getStats(Path segment, final SegmentReaderStats stats) throws Exception { - SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( - getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); long cnt = 0L; Text key = new Text(); - for (int i = 0; i < readers.length; i++) { - while (readers[i].next(key)) - cnt++; - readers[i].close(); - } - stats.generated = cnt; - Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); - if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) { - cnt = 0L; - long start = Long.MAX_VALUE; - long end = Long.MIN_VALUE; - CrawlDatum value = new CrawlDatum(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, - getConf()); - 
for (int i = 0; i < mreaders.length; i++) { - while (mreaders[i].next(key, value)) { + + if (ge) { + SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( + getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); + for (int i = 0; i < readers.length; i++) { + while (readers[i].next(key)) cnt++; - if (value.getFetchTime() < start) - start = value.getFetchTime(); - if (value.getFetchTime() > end) - end = value.getFetchTime(); + readers[i].close(); + } + stats.generated = cnt; + } + + if (fe) { + Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); + if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) { + cnt = 0L; + long start = Long.MAX_VALUE; + long end = Long.MIN_VALUE; + CrawlDatum value = new CrawlDatum(); + MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, + getConf()); + for (int i = 0; i < mreaders.length; i++) { + while (mreaders[i].next(key, value)) { + cnt++; + if (value.getFetchTime() < start) + start = value.getFetchTime(); + if (value.getFetchTime() > end) + end = value.getFetchTime(); + } + mreaders[i].close(); } - mreaders[i].close(); + stats.start = start; + stats.end = end; + stats.fetched = cnt; } - stats.start = start; - stats.end = end; - stats.fetched = cnt; } - Path parseDir = new Path(segment, ParseData.DIR_NAME); - if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) { - cnt = 0L; - long errors = 0L; - ParseData value = new ParseData(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, - getConf()); - for (int i = 0; i < mreaders.length; i++) { - while (mreaders[i].next(key, value)) { - cnt++; - if (!value.getStatus().isSuccess()) - errors++; + + if (pd) { + Path parseDir = new Path(segment, ParseData.DIR_NAME); + if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) { + cnt = 0L; + long errors = 0L; + ParseData value = new ParseData(); + MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, + getConf()); + for (int i 
= 0; i < mreaders.length; i++) { + while (mreaders[i].next(key, value)) { + cnt++; + if (!value.getStatus().isSuccess()) + errors++; + } + mreaders[i].close(); } - mreaders[i].close(); + stats.parsed = cnt; + stats.parseErrors = errors; } - stats.parsed = cnt; - stats.parseErrors = errors; } }