Author: dogacan
Date: Wed Jan 21 11:41:55 2009
New Revision: 736388

URL: http://svn.apache.org/viewvc?rev=736388&view=rev
Log:
NUTCH-579 - Feed plugin only indexes one post per feed due to identical digest

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=736388&r1=736387&r2=736388&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 21 11:41:55 2009
@@ -319,6 +319,9 @@
 
 119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
      (dogacan)
+
+120. NUTCH-579 - Feed plugin only indexes one post per feed due to identical
+     digest. (dogacan)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=736388&r1=736387&r2=736388&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Wed Jan 21 11:41:55 2009
@@ -33,6 +33,7 @@
   public byte[] calculate(Content content, Parse parse) {
     byte[] data = content.getContent();
     if (data == null) data = content.getUrl().getBytes();
-    return MD5Hash.digest(data).getDigest();
+    StringBuilder buf = new StringBuilder().append(data).append(parse.getText());
+    return MD5Hash.digest(buf.toString().getBytes()).getDigest();
   }
 }