Author: ferdy
Date: Thu May 10 12:45:58 2012
New Revision: 1336643

URL: http://svn.apache.org/viewvc?rev=1336643&view=rev
Log:
NUTCH-1026 Strip UTF-8 non-character codepoints

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/log4j.properties
    nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Thu May 10 12:45:58 2012
@@ -1,6 +1,8 @@
 Nutch Change Log
 
 Release nutchgora - Current Development
+* NUTCH-1026 Strip UTF-8 non-character codepoints (markus, ferdy)
+
 * NUTCH-1358 Do not accept bogus arguments (ferdy)
 
 * NUTCH-1349 Make batchId explcit within debug logging and improve CLI (lewismc + ferdy)

Modified: nutch/branches/nutchgora/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/log4j.properties?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/log4j.properties (original)
+++ nutch/branches/nutchgora/conf/log4j.properties Thu May 10 12:45:58 2012
@@ -34,6 +34,7 @@ log4j.logger.org.apache.nutch.fetcher.Fe
 log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
 log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Thu May 10 12:45:58 2012
@@ -21,6 +21,8 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map.Entry;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.nutch.indexer.NutchDocument;
@@ -32,6 +34,8 @@ import org.apache.solr.common.SolrInputD
 
 public class SolrWriter implements NutchIndexWriter {
 
+  public static Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
+
   private SolrServer solr;
   private SolrMappingReader solrMapping;
 
@@ -54,10 +58,16 @@ public class SolrWriter implements Nutch
     final SolrInputDocument inputDoc = new SolrInputDocument();
     for(final Entry<String, List<String>> e : doc) {
       for (final String val : e.getValue()) {
-        inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+
+        Object val2 = val;
+        if (e.getKey().equals("content")) {
+          val2 = stripNonCharCodepoints((String)val);
+        }
+
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val2);
         String sCopy = solrMapping.mapCopyKey(e.getKey());
         if (sCopy != e.getKey()) {
-               inputDoc.addField(sCopy, val);
+               inputDoc.addField(sCopy, val2);
         }
       }
     }
@@ -65,6 +75,7 @@ public class SolrWriter implements Nutch
     inputDocs.add(inputDoc);
     if (inputDocs.size() >= commitSize) {
       try {
+        LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents");
         solr.add(inputDocs);
       } catch (final SolrServerException e) {
         throw new IOException(e);
@@ -77,6 +88,7 @@ public class SolrWriter implements Nutch
   public void close() throws IOException {
     try {
       if (!inputDocs.isEmpty()) {
+        LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents");
         solr.add(inputDocs);
         inputDocs.clear();
       }
@@ -85,4 +97,25 @@ public class SolrWriter implements Nutch
     }
   }
 
+  /**
+   * Removes Unicode noncharacters (U+FDD0..U+FDEF and every U+xxFFFE/U+xxFFFF)
+   * and C0 control characters other than tab, line feed and carriage return.
+   * @param input text to clean; must not be null
+   * @return the input without the stripped code points
+   */
+  public static String stripNonCharCodepoints(String input) {
+    StringBuilder retval = new StringBuilder(input.length());
+    // Iterate by code point so supplementary-plane noncharacters, which arrive
+    // as surrogate pairs, are tested (and dropped) as one value.
+    for (int i = 0; i < input.length(); ) {
+      final int cp = input.codePointAt(i);
+      i += Character.charCount(cp);
+      if ((cp & 0xfffe) != 0xfffe && (cp < 0xfdd0 || cp > 0xfdef)
+          && (cp > 0x1f || cp == 0x9 || cp == 0xa || cp == 0xd)) {
+        retval.appendCodePoint(cp);
+      }
+    }
+    return retval.toString();
+  }
+
 }


Reply via email to