Author: dogacan
Date: Wed Jan 21 11:41:55 2009
New Revision: 736388
URL: http://svn.apache.org/viewvc?rev=736388&view=rev
Log:
NUTCH-579 - Feed plugin only indexes one post per feed due to identical digest
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=736388&r1=736387&r2=736388&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 21 11:41:55 2009
@@ -319,6 +319,9 @@
119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
(dogacan)
+
+120. NUTCH-579 - Feed plugin only indexes one post per feed due to identical
+ digest. (dogacan)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=736388&r1=736387&r2=736388&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Wed Jan 21 11:41:55 2009
@@ -33,6 +33,7 @@
public byte[] calculate(Content content, Parse parse) {
byte[] data = content.getContent();
if (data == null) data = content.getUrl().getBytes();
- return MD5Hash.digest(data).getDigest();
+ StringBuilder buf = new StringBuilder().append(data).append(parse.getText());
+ return MD5Hash.digest(buf.toString().getBytes()).getDigest();
}
}