Author: lewismc
Date: Tue Feb 19 00:40:49 2013
New Revision: 1447562
URL: http://svn.apache.org/r1447562
Log:
* NUTCH-1420 Get rid of the dreaded � (markus via lewismc)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1447562&r1=1447561&r2=1447562&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Feb 19 00:40:49 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1420 Get rid of the dreaded � (markus via lewismc)
+
* NUTCH-XX remove unused db.max.inlinks property in nutch-default.xml (lewismc)
* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java?rev=1447562&r1=1447561&r2=1447562&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java Tue Feb 19 00:40:49 2013
@@ -125,6 +125,13 @@ public class StringUtil {
public static boolean isEmpty(String str) {
return (str == null) || (str.equals(""));
}
+
+ /**
+ * Simple character substitution which cleans all � chars from a given String.
+ */
+ public static String cleanField(String value) {
+ return value.replaceAll("�", "");
+ }
public static void main(String[] args) {
if (args.length != 1)
Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1447562&r1=1447561&r2=1447562&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Tue Feb 19 00:40:49 2013
@@ -33,6 +33,7 @@ import org.apache.nutch.indexer.NutchDoc
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.solr.common.util.DateUtil;
@@ -105,7 +106,8 @@ public class BasicIndexingFilter impleme
}
// content is indexed, so that it's searchable, but not stored in index
- doc.add("content", TableUtil.toString(page.getText()));
+ String content = TableUtil.toString(page.getText());
+ doc.add("content", StringUtil.cleanField(content));
// title
String title = TableUtil.toString(page.getTitle());
@@ -114,7 +116,7 @@ public class BasicIndexingFilter impleme
}
if (title.length() > 0) {
// NUTCH-1004 Do not index empty values for title field
- doc.add("title", title);
+ doc.add("title", StringUtil.cleanField(title));
}
// add cached content/summary display policy, if available
ByteBuffer cachingRaw = page