Author: markus
Date: Wed Jul 13 13:59:11 2011
New Revision: 1146035
URL: http://svn.apache.org/viewvc?rev=1146035&view=rev
Log:
NUTCH-987, NUTCH-1036 Solr HTTP auth support and Hadoop reporter counter
increments
Added:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/conf/nutch-default.xml
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Wed Jul 13 13:59:11 2011
@@ -2,6 +2,10 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1036 Solr jobs should increment counters in Reporter (markus)
+
+* NUTCH-987 Support HTTP auth for Solr communication (markus)
+
* NUTCH-1027 Degrade log level of `can't find rules for scope` (markus)
* NUTCH-783 IndexingFiltersChecker utility (jnioche via markus)
Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Wed Jul 13 13:59:11 2011
@@ -1047,4 +1047,13 @@
</description>
</property>
+<property>
+ <name>solr.auth</name>
+ <value>false</value>
+ <description>
+ Whether to enable HTTP basic authentication for communicating with Solr.
+ Use the solr.auth.username and solr.auth.password properties to configure
+ your credentials.
+ </description>
+</property>
</configuration>
Modified:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Wed Jul 13 13:59:11 2011
@@ -134,11 +134,15 @@ implements Mapper<Text, Writable, Text,
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (final IndexingException e) {
if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
+ reporter.incrCounter("IndexerStatus", "Errors", 1);
return;
}
// skip documents discarded by indexing filters
- if (doc == null) return;
+ if (doc == null) {
+ reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
+ return;
+ }
float boost = 1.0f;
// run scoring filters
@@ -156,6 +160,8 @@ implements Mapper<Text, Writable, Text,
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
+ reporter.incrCounter("IndexerStatus", "Documents added", 1);
+
output.collect(key, doc);
}
Modified:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java Wed Jul 13 13:59:11 2011
@@ -45,7 +45,6 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
/**
@@ -102,7 +101,7 @@ public class SolrClean implements Tool {
@Override
public void configure(JobConf job) {
try {
- solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ solr = SolrUtils.getCommonsHttpSolrServer(job);
noCommit = job.getBoolean("noCommit", false);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
@@ -136,6 +135,7 @@ public class SolrClean implements Tool {
Text document = values.next();
updateRequest.deleteById(document.toString());
numDeletes++;
+ reporter.incrCounter("SolrCleanStatus", "Deleted documents", 1);
if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
try {
LOG.info("SolrClean: deleting " + numDeletes + " documents");
Modified:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Wed Jul 13 13:59:11 2011
@@ -47,7 +47,6 @@ import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
@@ -188,7 +187,7 @@ Tool {
/** Return each index as a split. */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
- SolrServer solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
solrQuery.setFields(SolrConstants.ID_FIELD);
@@ -219,7 +218,7 @@ Tool {
Reporter reporter)
throws IOException {
- SolrServer solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
SolrInputSplit solrSplit = (SolrInputSplit) split;
final int numDocs = solrSplit.getNumDocs();
@@ -298,7 +297,7 @@ Tool {
public void configure(JobConf job) {
try {
- solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ solr = SolrUtils.getCommonsHttpSolrServer(job);
noCommit = job.getBoolean("noCommit", false);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
@@ -336,6 +335,7 @@ Tool {
updateRequest.deleteById(solrRecord.id);
}
numDeletes++;
+ reporter.incrCounter("SolrDedupStatus", "Deleted documents", 1);
if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
try {
LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");
Modified:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Wed Jul 13 13:59:11 2011
@@ -36,7 +36,6 @@ import org.apache.nutch.util.NutchConfig
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import java.io.IOException;
import java.text.SimpleDateFormat;
@@ -84,7 +83,7 @@ public class SolrIndexer extends Configu
try {
JobClient.runJob(job);
// do the commits once and for all the reducers in one go
- SolrServer solr = new CommonsHttpSolrServer(solrUrl);
+ SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
if (!noCommit) {
solr.commit();
Added:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1146035&view=auto
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java (added)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java Wed Jul 13 13:59:11 2011
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.solr;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.UsernamePasswordCredentials;
+import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+
+import java.net.MalformedURLException;
+
+public class SolrUtils {
+
+ public static Log LOG = LogFactory.getLog(SolrIndexer.class);
+
+ public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) throws MalformedURLException {
+ HttpClient client=new HttpClient();
+
+ // Check for username/password
+ if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
+ String username = job.get(SolrConstants.USERNAME);
+
+ LOG.info("Authenticating as: " + username);
+
+ AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+
+ client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
+
+ HttpClientParams params = client.getParams();
+ params.setAuthenticationPreemptive(true);
+
+ client.setParams(params);
+ }
+
+ return new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
+ }
+
+ public static String stripNonCharCodepoints(String input) {
+ StringBuilder retval = new StringBuilder();
+ char ch;
+
+ for (int i = 0; i < input.length(); i++) {
+ ch = input.charAt(i);
+
+ // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+ // and non-printable control characters except tabulator, new line and carriage return
+ if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
+ ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+ (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+ (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+
+ retval.append(ch);
+ }
+ }
+
+ return retval.toString();
+ }
+}
\ No newline at end of file
Modified:
nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Jul 13 13:59:11 2011
@@ -30,7 +30,6 @@ import org.apache.nutch.indexer.NutchFie
import org.apache.nutch.indexer.NutchIndexWriter;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.DateUtil;
@@ -47,7 +46,7 @@ public class SolrWriter implements Nutch
private int commitSize;
public void open(JobConf job, String name) throws IOException {
- solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+ solr = SolrUtils.getCommonsHttpSolrServer(job);
commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
solrMapping = SolrMappingReader.getInstance(job);
}
@@ -64,7 +63,7 @@ public class SolrWriter implements Nutch
}
if (e.getKey().equals("content")) {
- val2 = stripNonCharCodepoints((String)val);
+ val2 = SolrUtils.stripNonCharCodepoints((String)val);
}
inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue().getWeight());
@@ -105,26 +104,4 @@ public class SolrWriter implements Nutch
ioe.initCause(e);
return ioe;
}
-
- public static String stripNonCharCodepoints(String input) {
- StringBuilder retval = new StringBuilder();
- char ch;
-
- for (int i = 0; i < input.length(); i++) {
- ch = input.charAt(i);
-
- // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
- // and non-printable control characters except tabulator, new line and carriage return
- if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
- ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
- (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
- (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
-
- retval.append(ch);
- }
- }
-
- return retval.toString();
- }
-
}