Author: markus
Date: Wed Jul 1 07:00:40 2015
New Revision: 1688566
URL: http://svn.apache.org/r1688566
Log:
NUTCH-1692 SegmentReader was broken in distributed mode
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688566&r1=1688565&r2=1688566&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 1 07:00:40 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp)
+
* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus)
* NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks)
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1688566&r1=1688565&r2=1688566&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jul 1 07:00:40 2015
@@ -507,55 +507,64 @@ public class SegmentReader extends Confi
public void getStats(Path segment, final SegmentReaderStats stats)
throws Exception {
- SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(
- getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
long cnt = 0L;
Text key = new Text();
- for (int i = 0; i < readers.length; i++) {
- while (readers[i].next(key))
- cnt++;
- readers[i].close();
- }
- stats.generated = cnt;
- Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
- if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
- cnt = 0L;
- long start = Long.MAX_VALUE;
- long end = Long.MIN_VALUE;
- CrawlDatum value = new CrawlDatum();
- MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir,
- getConf());
- for (int i = 0; i < mreaders.length; i++) {
- while (mreaders[i].next(key, value)) {
+
+ if (ge) {
+ SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(
+ getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
+ for (int i = 0; i < readers.length; i++) {
+ while (readers[i].next(key))
cnt++;
- if (value.getFetchTime() < start)
- start = value.getFetchTime();
- if (value.getFetchTime() > end)
- end = value.getFetchTime();
+ readers[i].close();
+ }
+ stats.generated = cnt;
+ }
+
+ if (fe) {
+ Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
+ if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
+ cnt = 0L;
+ long start = Long.MAX_VALUE;
+ long end = Long.MIN_VALUE;
+ CrawlDatum value = new CrawlDatum();
+ MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir,
+ getConf());
+ for (int i = 0; i < mreaders.length; i++) {
+ while (mreaders[i].next(key, value)) {
+ cnt++;
+ if (value.getFetchTime() < start)
+ start = value.getFetchTime();
+ if (value.getFetchTime() > end)
+ end = value.getFetchTime();
+ }
+ mreaders[i].close();
}
- mreaders[i].close();
+ stats.start = start;
+ stats.end = end;
+ stats.fetched = cnt;
}
- stats.start = start;
- stats.end = end;
- stats.fetched = cnt;
}
- Path parseDir = new Path(segment, ParseData.DIR_NAME);
- if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
- cnt = 0L;
- long errors = 0L;
- ParseData value = new ParseData();
- MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir,
- getConf());
- for (int i = 0; i < mreaders.length; i++) {
- while (mreaders[i].next(key, value)) {
- cnt++;
- if (!value.getStatus().isSuccess())
- errors++;
+
+ if (pd) {
+ Path parseDir = new Path(segment, ParseData.DIR_NAME);
+ if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
+ cnt = 0L;
+ long errors = 0L;
+ ParseData value = new ParseData();
+ MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir,
+ getConf());
+ for (int i = 0; i < mreaders.length; i++) {
+ while (mreaders[i].next(key, value)) {
+ cnt++;
+ if (!value.getStatus().isSuccess())
+ errors++;
+ }
+ mreaders[i].close();
}
- mreaders[i].close();
+ stats.parsed = cnt;
+ stats.parseErrors = errors;
}
- stats.parsed = cnt;
- stats.parseErrors = errors;
}
}