Author: ferdy
Date: Thu May 10 12:45:58 2012
New Revision: 1336643
URL: http://svn.apache.org/viewvc?rev=1336643&view=rev
Log:
NUTCH-1026 Strip UTF-8 non-character codepoints
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/log4j.properties
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Thu May 10 12:45:58 2012
@@ -1,6 +1,8 @@
Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-1026 Strip UTF-8 non-character codepoints (markus, ferdy)
+
* NUTCH-1358 Do not accept bogus arguments (ferdy)
* NUTCH-1349 Make batchId explcit within debug logging and improve CLI
(lewismc + ferdy)
Modified: nutch/branches/nutchgora/conf/log4j.properties
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/log4j.properties?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/log4j.properties (original)
+++ nutch/branches/nutchgora/conf/log4j.properties Thu May 10 12:45:58 2012
@@ -34,6 +34,7 @@ log4j.logger.org.apache.nutch.fetcher.Fe
log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1336643&r1=1336642&r2=1336643&view=diff
==============================================================================
---
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
(original)
+++
nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Thu May 10 12:45:58 2012
@@ -21,6 +21,8 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.nutch.indexer.NutchDocument;
@@ -32,6 +34,8 @@ import org.apache.solr.common.SolrInputD
public class SolrWriter implements NutchIndexWriter {
+ public static Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
+
private SolrServer solr;
private SolrMappingReader solrMapping;
@@ -54,10 +58,16 @@ public class SolrWriter implements Nutch
final SolrInputDocument inputDoc = new SolrInputDocument();
for(final Entry<String, List<String>> e : doc) {
for (final String val : e.getValue()) {
- inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+
+ Object val2 = val;
+ if (e.getKey().equals("content")) {
+ val2 = stripNonCharCodepoints((String)val);
+ }
+
+ inputDoc.addField(solrMapping.mapKey(e.getKey()), val2);
String sCopy = solrMapping.mapCopyKey(e.getKey());
if (sCopy != e.getKey()) {
- inputDoc.addField(sCopy, val);
+ inputDoc.addField(sCopy, val2);
}
}
}
@@ -65,6 +75,7 @@ public class SolrWriter implements Nutch
inputDocs.add(inputDoc);
if (inputDocs.size() >= commitSize) {
try {
+ LOG.info("Adding " + Integer.toString(inputDocs.size()) + "
documents");
solr.add(inputDocs);
} catch (final SolrServerException e) {
throw new IOException(e);
@@ -77,6 +88,7 @@ public class SolrWriter implements Nutch
public void close() throws IOException {
try {
if (!inputDocs.isEmpty()) {
+ LOG.info("Adding " + Integer.toString(inputDocs.size()) + "
documents");
solr.add(inputDocs);
inputDocs.clear();
}
@@ -85,4 +97,25 @@ public class SolrWriter implements Nutch
}
}
+ public static String stripNonCharCodepoints(String input) {
+ StringBuilder retval = new StringBuilder();
+ char ch;
+
+ for (int i = 0; i < input.length(); i++) {
+ ch = input.charAt(i);
+
+ // Strip all non-characters
http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+ // and non-printable control characters except tabulator, new line and
carriage return
+ if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
+ ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+ (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+ (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+
+ retval.append(ch);
+ }
+ }
+
+ return retval.toString();
+ }
+
}