Author: jerome
Date: Tue May  9 16:06:17 2006
New Revision: 405566

URL: http://svn.apache.org/viewcvs?rev=405566&view=rev
Log:
NUTCH-134 - The clusterer no longer needs to remove HTML tags from summaries

Modified:
    
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java

Modified: 
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=405566&r1=405565&r2=405566&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java Tue May  9 16:06:17 2006
@@ -34,7 +34,6 @@
 import com.dawidweiss.carrot.core.local.ProcessingException;
 import com.dawidweiss.carrot.core.local.RequestContext;
 import com.dawidweiss.carrot.core.local.clustering.*;
-import com.dawidweiss.carrot.util.common.StringUtils;
 
 /**
  * A local input component that ignores the query passed from the
@@ -103,7 +102,7 @@
     // produce 'documents' for successor components.
     final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
     for (int i=0;i<summaries.length;i++) {
-      consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i]), defaultLanguage));
+      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
     }
   }
 
@@ -121,14 +120,4 @@
     return SUCCESSOR_CAPABILITIES;
   }
 
-  /**
-   * Converts a html chunk to plain text.
-   * 
-   * This method is only required because Nutch's summaries are in HTML.
-   * I guess it would be possible to get rid of the code below by
-   * adding patches/ methods to Nutch that return plain text summaries. 
-   */
-  private final String htmlToText(String html) {
-    return StringUtils.removeMarkup(html);
-  }
 }


Reply via email to