Author: markus
Date: Wed Jul 13 13:59:11 2011
New Revision: 1146035

URL: http://svn.apache.org/viewvc?rev=1146035&view=rev
Log:
NUTCH-987, NUTCH-1036 Solr HTTP auth support and Hadoop reporter counter increments

Added:
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/conf/nutch-default.xml
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
    nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Wed Jul 13 13:59:11 2011
@@ -2,6 +2,10 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1036 Solr jobs should increment counters in Reporter (markus)
+
+* NUTCH-987 Support HTTP auth for Solr communication (markus)
+
 * NUTCH-1027 Degrade log level of `can't find rules for scope` (markus)
 
 * NUTCH-783 IndexingFiltersChecker utility (jnioche via markus)

Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Wed Jul 13 13:59:11 2011
@@ -1047,4 +1047,13 @@
   </description>  
 </property> 
 
+<property>
+  <name>solr.auth</name>
+  <value>false</value>
+  <description>
+  Whether to enable HTTP basic authentication for communicating with Solr.
+  Use the solr.auth.username and solr.auth.password properties to configure
+  your credentials.
+  </description>
+</property>
 </configuration>

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Wed Jul 13 13:59:11 2011
@@ -134,11 +134,15 @@ implements Mapper<Text, Writable, Text, 
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
     } catch (final IndexingException e) {
       if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
+      reporter.incrCounter("IndexerStatus", "Errors", 1);
       return;
     }
 
     // skip documents discarded by indexing filters
-    if (doc == null) return;
+    if (doc == null) {
+      reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
+      return;
+    }
 
     float boost = 1.0f;
     // run scoring filters
@@ -156,6 +160,8 @@ implements Mapper<Text, Writable, Text, 
     // store boost for use by explain and dedup
     doc.add("boost", Float.toString(boost));
 
+    reporter.incrCounter("IndexerStatus", "Documents added", 1);
+
     output.collect(key, doc);
   }
 

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrClean.java Wed Jul 13 13:59:11 2011
@@ -45,7 +45,6 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 
 /**
@@ -102,7 +101,7 @@ public class SolrClean implements Tool {
     @Override
     public void configure(JobConf job) {
       try {
-        solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+        solr = SolrUtils.getCommonsHttpSolrServer(job);
         noCommit = job.getBoolean("noCommit", false);
       } catch (MalformedURLException e) {
         throw new RuntimeException(e);
@@ -136,6 +135,7 @@ public class SolrClean implements Tool {
         Text document = values.next();
         updateRequest.deleteById(document.toString());
         numDeletes++;
+        reporter.incrCounter("SolrCleanStatus", "Deleted documents", 1);
         if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
           try {
             LOG.info("SolrClean: deleting " + numDeletes + " documents");

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Wed Jul 13 13:59:11 2011
@@ -47,7 +47,6 @@ import org.apache.nutch.util.TimingUtil;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
@@ -188,7 +187,7 @@ Tool {
 
     /** Return each index as a split. */
     public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
-      SolrServer solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
 
       final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
       solrQuery.setFields(SolrConstants.ID_FIELD);
@@ -219,7 +218,7 @@ Tool {
         Reporter reporter)
         throws IOException {
 
-      SolrServer solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
       SolrInputSplit solrSplit = (SolrInputSplit) split;
       final int numDocs = solrSplit.getNumDocs();
       
@@ -298,7 +297,7 @@ Tool {
 
   public void configure(JobConf job) {
     try {
-      solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+      solr = SolrUtils.getCommonsHttpSolrServer(job);
       noCommit = job.getBoolean("noCommit", false);
     } catch (MalformedURLException e) {
       throw new RuntimeException(e);
@@ -336,6 +335,7 @@ Tool {
         updateRequest.deleteById(solrRecord.id);
       }
       numDeletes++;
+      reporter.incrCounter("SolrDedupStatus", "Deleted documents", 1);
       if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
         try {
           LOG.info("SolrDeleteDuplicates: deleting " + numDeletes + " duplicates");

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Wed Jul 13 13:59:11 2011
@@ -36,7 +36,6 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 
 import java.io.IOException;
 import java.text.SimpleDateFormat;
@@ -84,7 +83,7 @@ public class SolrIndexer extends Configu
     try {
       JobClient.runJob(job);
       // do the commits once and for all the reducers in one go
-      SolrServer solr =  new CommonsHttpSolrServer(solrUrl);
+      SolrServer solr =  SolrUtils.getCommonsHttpSolrServer(job);
 
       if (!noCommit) {
         solr.commit();

Added: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1146035&view=auto
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java (added)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrUtils.java Wed Jul 13 13:59:11 2011
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.solr;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.UsernamePasswordCredentials;
+import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+
+import java.net.MalformedURLException;
+
+public class SolrUtils {
+
+  public static Log LOG = LogFactory.getLog(SolrIndexer.class);
+
+  public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) throws MalformedURLException {
+    HttpClient client=new HttpClient();
+
+    // Check for username/password
+    if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
+      String username = job.get(SolrConstants.USERNAME);
+
+      LOG.info("Authenticating as: " + username);
+
+      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+
+      client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
+
+      HttpClientParams params = client.getParams();
+      params.setAuthenticationPreemptive(true);
+
+      client.setParams(params);
+    }
+
+    return new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL), client);
+  }
+
+  public static String stripNonCharCodepoints(String input) {
+    StringBuilder retval = new StringBuilder();
+    char ch;
+
+    for (int i = 0; i < input.length(); i++) {
+      ch = input.charAt(i);
+
+      // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+      // and non-printable control characters except tabulator, new line and carriage return
+      if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
+          ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+          (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+          (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+
+        retval.append(ch);
+      }
+    }
+
+    return retval.toString();
+  }
+}
\ No newline at end of file

Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1146035&r1=1146034&r2=1146035&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Jul 13 13:59:11 2011
@@ -30,7 +30,6 @@ import org.apache.nutch.indexer.NutchFie
 import org.apache.nutch.indexer.NutchIndexWriter;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.util.DateUtil;
 
@@ -47,7 +46,7 @@ public class SolrWriter implements Nutch
   private int commitSize;
 
   public void open(JobConf job, String name) throws IOException {
-    solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
+    solr = SolrUtils.getCommonsHttpSolrServer(job);
     commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
     solrMapping = SolrMappingReader.getInstance(job);
   }
@@ -64,7 +63,7 @@ public class SolrWriter implements Nutch
         }
 
         if (e.getKey().equals("content")) {
-          val2 = stripNonCharCodepoints((String)val);
+          val2 = SolrUtils.stripNonCharCodepoints((String)val);
         }
 
         inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue().getWeight());
@@ -105,26 +104,4 @@ public class SolrWriter implements Nutch
     ioe.initCause(e);
     return ioe;
   }
-
-  public static String stripNonCharCodepoints(String input) {
-    StringBuilder retval = new StringBuilder();
-    char ch;
-
-    for (int i = 0; i < input.length(); i++) {
-      ch = input.charAt(i);
-
-      // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
-      // and non-printable control characters except tabulator, new line and carriage return
-      if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
-          ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
-          (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
-          (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
-
-        retval.append(ch);
-      }
-    }
-
-    return retval.toString();
-  }
-
 }


Reply via email to