Author: jerome
Date: Tue May 9 16:06:17 2006
New Revision: 405566
URL: http://svn.apache.org/viewcvs?rev=405566&view=rev
Log:
NUTCH-134 - No more need for the clusterer to remove HTML tags from summaries
Modified:
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
Modified:
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=405566&r1=405565&r2=405566&view=diff
==============================================================================
---
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
(original)
+++
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
Tue May 9 16:06:17 2006
@@ -34,7 +34,6 @@
import com.dawidweiss.carrot.core.local.ProcessingException;
import com.dawidweiss.carrot.core.local.RequestContext;
import com.dawidweiss.carrot.core.local.clustering.*;
-import com.dawidweiss.carrot.util.common.StringUtils;
/**
* A local input component that ignores the query passed from the
@@ -103,7 +102,7 @@
// produce 'documents' for successor components.
final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
for (int i=0;i<summaries.length;i++) {
- consumer.addDocument(new NutchDocument(i, details[i],
htmlToText(summaries[i]), defaultLanguage));
+ consumer.addDocument(new NutchDocument(i, details[i], summaries[i],
defaultLanguage));
}
}
@@ -121,14 +120,4 @@
return SUCCESSOR_CAPABILITIES;
}
- /**
- * Converts a html chunk to plain text.
- *
- * This method is only required because Nutch's summaries are in HTML.
- * I guess it would be possible to get rid of the code below by
- * adding patches/ methods to Nutch that return plain text summaries.
- */
- private final String htmlToText(String html) {
- return StringUtils.removeMarkup(html);
- }
}