Author: lewismc
Date: Tue Feb 19 00:44:56 2013
New Revision: 1447563

URL: http://svn.apache.org/r1447563
Log:
* NUTCH-1420 Get rid of the dreaded � (markus via lewismc)

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
    nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1447563&r1=1447562&r2=1447563&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 19 00:44:56 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1420 Get rid of the dreaded � (markus via lewismc)
+
 * NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers (Lufeng via lewismc)
 
 * NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)

Modified: nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=1447563&r1=1447562&r2=1447563&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Tue Feb 19 00:44:56 2013
@@ -125,6 +125,13 @@ public class StringUtil {
   public static boolean isEmpty(String str) {
     return (str == null) || (str.equals(""));
   }
+  
+  /**
+   * Simple character substitution which cleans all � chars from a given String.
+   */
+  public static String cleanField(String value) {
+    return value.replaceAll("�", "");
+  }
 
   public static void main(String[] args) {
     if (args.length != 1)

Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1447563&r1=1447562&r2=1447563&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Tue Feb 19 00:44:56 2013
@@ -26,6 +26,7 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.io.Text;
 
@@ -103,7 +104,7 @@ public class BasicIndexingFilter impleme
     if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
       content = content.substring(0, MAX_CONTENT_LENGTH);
     }
-    doc.add("content", content);
+    doc.add("content", StringUtil.cleanField(content));
 
     // title
     String title = parse.getData().getTitle();
@@ -113,7 +114,7 @@ public class BasicIndexingFilter impleme
 
     if (title.length() > 0) {
       // NUTCH-1004 Do not index empty values for title field
-      doc.add("title", title);
+      doc.add("title", StringUtil.cleanField(title));
     }
 
     // add cached content/summary display policy, if available


Reply via email to