Author: markus
Date: Fri Jul 5 08:52:51 2013
New Revision: 1499948

URL: http://svn.apache.org/r1499948
Log:
NUTCH-1520 SegmentMerger looses records
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499948&r1=1499947&r2=1499948&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jul 5 08:52:51 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1520 SegmentMerger looses records (markus)
+
 * NUTCH-1602 improve the readability of metadata in readdb dump normal (lufeng)
 
 * NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus)

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1499948&r1=1499947&r2=1499948&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Fri Jul 5 08:52:51 2013
@@ -412,10 +412,14 @@ public class SegmentMerger extends Confi
           lastF = val;
           lastFname = sp.segmentName;
         } else {
-          // take newer
-          if (lastFname.compareTo(sp.segmentName) < 0) {
-            lastF = val;
-            lastFname = sp.segmentName;
+          // only consider fetch status
+          // https://issues.apache.org/jira/browse/NUTCH-1520
+          if (CrawlDatum.hasFetchStatus(val)) {
+            // take newer
+            if (lastFname.compareTo(sp.segmentName) < 0) {
+              lastF = val;
+              lastFname = sp.segmentName;
+            }
           }
         }
       } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
@@ -480,7 +484,7 @@ public class SegmentMerger extends Confi
         linked.isEmpty() ? null : linked.lastEntry().getValue())){
       return;
     }
-    
+
     curCount++;
     String sliceName = null;
     MetaWrapper wrapper = new MetaWrapper();