Author: markus
Date: Fri Jul 5 08:52:51 2013
New Revision: 1499948
URL: http://svn.apache.org/r1499948
Log:
NUTCH-1520 SegmentMerger loses records
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499948&r1=1499947&r2=1499948&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jul 5 08:52:51 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1520 SegmentMerger looses records (markus)
+
* NUTCH-1602 improve the readability of metadata in readdb dump normal (lufeng)
* NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus)
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1499948&r1=1499947&r2=1499948&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Fri Jul 5 08:52:51 2013
@@ -412,10 +412,14 @@ public class SegmentMerger extends Confi
lastF = val;
lastFname = sp.segmentName;
} else {
- // take newer
- if (lastFname.compareTo(sp.segmentName) < 0) {
- lastF = val;
- lastFname = sp.segmentName;
+ // only consider fetch status
+ // https://issues.apache.org/jira/browse/NUTCH-1520
+ if (CrawlDatum.hasFetchStatus(val)) {
+ // take newer
+ if (lastFname.compareTo(sp.segmentName) < 0) {
+ lastF = val;
+ lastFname = sp.segmentName;
+ }
}
}
} else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
@@ -480,7 +484,7 @@ public class SegmentMerger extends Confi
linked.isEmpty() ? null :
linked.lastEntry().getValue())){
return;
}
-
+
curCount++;
String sliceName = null;
MetaWrapper wrapper = new MetaWrapper();