[
https://issues.apache.org/jira/browse/NUTCH-2600?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16519448#comment-16519448
]
ASF GitHub Bot commented on NUTCH-2600:
---------------------------------------
sebastian-nagel closed pull request #351: fix for NUTCH-2600: indexer-solr
refactoring
URL: https://github.com/apache/nutch/pull/351
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign pull request (one
from a fork) once it has been merged, the diff is reproduced below for
the sake of provenance:
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index 2e4a1350b..6177073de 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -22,10 +22,10 @@
<writer id="indexer_solr_1"
class="org.apache.nutch.indexwriter.solr.SolrIndexWriter">
<parameters>
<param name="type" value="http"/>
- <!-- Solr URL (default core name is "nutch" but you may change it): -->
<param name="url" value="http://localhost:8983/solr/nutch"/>
- <param name="commitSize" value="250"/>
- <param name="commitIndex" value="true"/>
+ <param name="collection" value=""/>
+ <param name="weight.field" value=""/>
+ <param name="commitSize" value="1000"/>
<param name="auth" value="false"/>
<param name="username" value="username"/>
<param name="password" value="password"/>
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java
b/src/java/org/apache/nutch/indexer/IndexingJob.java
index ae77da4a0..67b7e0ba0 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -125,9 +125,6 @@ public void index(Path crawlDb, Path linkDb, List<Path>
segments,
IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job,
addBinaryContent);
- // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
- // job.set(SolrConstants.SERVER_URL, solrUrl);
-
conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
conf.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
conf.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index 65e97e7f6..7a02dc4dc 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -36,9 +36,9 @@
</publications>
<dependencies>
- <dependency org="org.apache.solr" name="solr-solrj" rev="5.5.0"/>
- <dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.1"
conf="*->default"/>
- <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.4.1"
conf="*->default"/>
+ <dependency org="org.apache.solr" name="solr-solrj" rev="7.3.1"/>
+ <dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.6"
conf="*->default"/>
+ <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.5.3"
conf="*->default"/>
</dependencies>
</ivy-module>
diff --git a/src/plugin/indexer-solr/plugin.xml
b/src/plugin/indexer-solr/plugin.xml
index a2f2f67a4..0d91eab86 100644
--- a/src/plugin/indexer-solr/plugin.xml
+++ b/src/plugin/indexer-solr/plugin.xml
@@ -22,15 +22,15 @@
<library name="indexer-solr.jar">
<export name="*" />
</library>
- <library name="commons-io-2.4.jar"/>
- <library name="httpclient-4.4.1.jar"/>
- <library name="httpcore-4.4.1.jar"/>
- <library name="httpmime-4.4.1.jar"/>
- <library name="noggit-0.6.jar"/>
- <library name="solr-solrj-5.5.0.jar"/>
+ <library name="commons-io-2.5.jar"/>
+ <library name="httpclient-4.5.3.jar"/>
+ <library name="httpcore-4.4.6.jar"/>
+ <library name="httpmime-4.5.3.jar"/>
+ <library name="noggit-0.8.jar"/>
+ <library name="solr-solrj-7.3.1.jar"/>
<library name="stax2-api-3.1.4.jar"/>
<library name="woodstox-core-asl-4.4.1.jar"/>
- <library name="zookeeper-3.4.6.jar"/>
+ <library name="zookeeper-3.4.11.jar"/>
</runtime>
<requires>
diff --git
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
index 5c793110e..302ed75ed 100644
---
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
+++
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
@@ -17,43 +17,21 @@
package org.apache.nutch.indexwriter.solr;
public interface SolrConstants {
- public static final String SOLR_PREFIX = "solr.";
- public static final String SERVER_TYPE = "type";
+ String SERVER_TYPE = "type";
- public static final String SERVER_URL = "url";
+ String SERVER_URLS = "url";
- public static final String COMMIT_SIZE = "commitSize";
+ String COLLECTION = "collection";
- public static final String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
+ String COMMIT_SIZE = "commitSize";
- public static final String USE_AUTH = "auth";
+ String WEIGHT_FIELD = "weight.field";
- public static final String USERNAME = "username";
+ String USE_AUTH = "auth";
- public static final String PASSWORD = "password";
+ String USERNAME = "username";
- public static final String LOAD_BALANCE_URL = "loadbalanceURL";
+ String PASSWORD = "password";
- public static final String COLLECTION = "collection";
-
- public static final String ZOOKEEPER_HOSTS = SOLR_PREFIX + "zookeeper.hosts";
-
- public static final String ID_FIELD = "id";
-
- public static final String URL_FIELD = "url";
-
- public static final String BOOST_FIELD = "boost";
-
- public static final String TIMESTAMP_FIELD = "tstamp";
-
- public static final String DIGEST_FIELD = "digest";
-
-
-
- @Deprecated
- public static final String COMMIT_INDEX = "commitIndex";
-
- @Deprecated
- public static final String PARAMS = SOLR_PREFIX + "params";
}
diff --git
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
index 09a096a84..24172c6bc 100644
---
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
+++
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
@@ -20,7 +20,9 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
+import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;
@@ -34,17 +36,12 @@
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
-import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
-import org.apache.solr.common.util.DateUtil;
-import org.apache.solr.common.util.NamedList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-// WORK AROUND FOR NOT REMOVING URL ENCODED URLS!!!
-
public class SolrIndexWriter implements IndexWriter {
private static final Logger LOG = LoggerFactory
@@ -55,21 +52,23 @@
private Configuration config;
- private final List<SolrInputDocument> inputDocs = new
ArrayList<SolrInputDocument>();
-
- private final List<SolrInputDocument> updateDocs = new
ArrayList<SolrInputDocument>();
+ private final List<SolrInputDocument> inputDocs = new ArrayList<>();
- private final List<String> deleteIds = new ArrayList<String>();
+ private final List<String> deleteIds = new ArrayList<>();
private int batchSize;
private int numDeletes = 0;
private int totalAdds = 0;
private int totalDeletes = 0;
- private int totalUpdates = 0;
private boolean delete = false;
+ private String weightField;
+
+ private boolean auth;
+ private String username;
+ private String password;
@Override
- public void open(Configuration conf, String name) throws IOException {
+ public void open(Configuration conf, String name) {
//Implementation not required
}
@@ -77,13 +76,12 @@ public void open(Configuration conf, String name) throws
IOException {
* Initializes the internal variables from a given index writer
configuration.
*
* @param parameters Params from the index writer configuration.
- * @throws IOException Some exception thrown by writer.
*/
@Override
- public void open(IndexWriterParams parameters) throws IOException {
- String type = parameters.get("type", "http");
+ public void open(IndexWriterParams parameters) {
+ String type = parameters.get(SolrConstants.SERVER_TYPE, "http");
- String[] urls = parameters.getStrings("url");
+ String[] urls = parameters.getStrings(SolrConstants.SERVER_URLS);
if (urls == null) {
String message = "Missing SOLR URL.\n" + describe();
@@ -91,6 +89,10 @@ public void open(IndexWriterParams parameters) throws
IOException {
throw new RuntimeException(message);
}
+ this.auth = parameters.getBoolean(SolrConstants.USE_AUTH, false);
+ this.username = parameters.get(SolrConstants.USERNAME);
+ this.password = parameters.get(SolrConstants.PASSWORD);
+
this.solrClients = new ArrayList<>();
switch (type) {
@@ -100,11 +102,12 @@ public void open(IndexWriterParams parameters) throws
IOException {
}
break;
case "cloud":
- for (String url : urls) {
- CloudSolrClient sc = SolrUtils.getCloudSolrClient(url);
- sc.setDefaultCollection(parameters.get(SolrConstants.COLLECTION));
- solrClients.add(sc);
- }
+ CloudSolrClient sc = this.auth ?
+ SolrUtils.getCloudSolrClient(Arrays.asList(urls), this.username,
+ this.password) :
+ SolrUtils.getCloudSolrClient(Arrays.asList(urls));
+ sc.setDefaultCollection(parameters.get(SolrConstants.COLLECTION));
+ solrClients.add(sc);
break;
case "concurrent":
// TODO: 1/08/17 Implement this
@@ -123,9 +126,10 @@ public void open(IndexWriterParams parameters) throws
IOException {
}
private void init(IndexWriterParams properties) {
- batchSize = Integer
- .parseInt(properties.getOrDefault(SolrConstants.COMMIT_SIZE, "1000"));
+ batchSize = properties.getInt(SolrConstants.COMMIT_SIZE, 1000);
delete = config.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+ weightField = properties.get(SolrConstants.WEIGHT_FIELD, "");
+
// parse optional params
params = new ModifiableSolrParams();
String paramString = config.get(IndexerMapReduce.INDEXER_PARAMS);
@@ -166,18 +170,6 @@ public void delete(String key) throws IOException {
}
- public void deleteByQuery(String query) throws IOException {
- try {
- LOG.info("SolrWriter: deleting " + query);
- for (SolrClient solrClient : solrClients) {
- solrClient.deleteByQuery(query);
- }
- } catch (final SolrServerException e) {
- LOG.error("Error deleting: " + deleteIds);
- throw makeIOException(e);
- }
- }
-
@Override
public void update(NutchDocument doc) throws IOException {
write(doc);
@@ -192,18 +184,20 @@ public void write(NutchDocument doc) throws IOException {
Object val2 = val;
if (val instanceof Date) {
- val2 = DateUtil.getThreadLocalDateFormat().format(val);
+ val2 = DateTimeFormatter.ISO_INSTANT.format(((Date)
val).toInstant());
}
if (e.getKey().equals("content") || e.getKey().equals("title")) {
val2 = SolrUtils.stripNonCharCodepoints((String) val);
}
- inputDoc.addField(e.getKey(), val2, e.getValue().getWeight());
+ inputDoc.addField(e.getKey(), val2);
}
}
- inputDoc.setDocumentBoost(doc.getWeight());
+ if (!weightField.isEmpty()) {
+ inputDoc.addField(weightField, doc.getWeight());
+ }
inputDocs.add(inputDoc);
totalAdds++;
@@ -225,14 +219,21 @@ public void commit() throws IOException {
push();
try {
for (SolrClient solrClient : solrClients) {
- solrClient.commit();
+ if (this.auth) {
+ UpdateRequest req = new UpdateRequest();
+ req.setAction(UpdateRequest.ACTION.COMMIT, true, true);
+ req.setBasicAuthCredentials(this.username, this.password);
+ solrClient.request(req);
+ } else {
+ solrClient.commit();
+ }
}
} catch (final SolrServerException e) {
- LOG.error("Failed to commit solr connection: " + e.getMessage()); //
FIXME
+ LOG.error("Failed to commit solr connection: " + e.getMessage());
}
}
- public void push() throws IOException {
+ private void push() throws IOException {
if (inputDocs.size() > 0) {
try {
LOG.info(
@@ -242,10 +243,13 @@ public void push() throws IOException {
numDeletes = 0;
UpdateRequest req = new UpdateRequest();
req.add(inputDocs);
- req.setAction(AbstractUpdateRequest.ACTION.OPTIMIZE, false, false);
+ req.setAction(UpdateRequest.ACTION.OPTIMIZE, false, false);
req.setParams(params);
+ if (this.auth) {
+ req.setBasicAuthCredentials(this.username, this.password);
+ }
for (SolrClient solrClient : solrClients) {
- NamedList res = solrClient.request(req);
+ solrClient.request(req);
}
} catch (final SolrServerException e) {
throw makeIOException(e);
@@ -269,10 +273,8 @@ public void push() throws IOException {
}
}
- public static IOException makeIOException(SolrServerException e) {
- final IOException ioe = new IOException();
- ioe.initCause(e);
- return ioe;
+ private static IOException makeIOException(SolrServerException e) {
+ return new IOException(e);
}
@Override
@@ -295,7 +297,7 @@ public String describe() {
StringBuffer sb = new StringBuffer("SOLRIndexWriter\n");
sb.append("\t").append(SolrConstants.SERVER_TYPE).append(
" : Type of the server. Can be: \"cloud\", \"concurrent\", \"http\" or
\"lb\"\n");
- sb.append("\t").append(SolrConstants.SERVER_URL)
+ sb.append("\t").append(SolrConstants.SERVER_URLS)
.append(" : URL of the SOLR instance or URL of the Zookeeper
quorum\n");
sb.append("\t").append(SolrConstants.COMMIT_SIZE)
.append(" : buffer size when sending to SOLR (default 1000)\n");
diff --git
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
index 74b405394..196fc5d1c 100644
---
a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
+++
b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
@@ -16,64 +16,50 @@
*/
package org.apache.nutch.indexwriter.solr;
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CredentialsProvider;
+import org.apache.http.client.HttpClient;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
-import java.net.MalformedURLException;
+import java.util.List;
public class SolrUtils {
- private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
-
- /**
- * @param conf
- * @return SolrClient
- */
- public static ArrayList<SolrClient> getSolrClients(Configuration conf)
- throws MalformedURLException {
- String[] urls = conf.getStrings(SolrConstants.SERVER_URL);
- String[] zkHostString = conf.getStrings(SolrConstants.ZOOKEEPER_HOSTS);
- ArrayList<SolrClient> solrClients = new ArrayList<SolrClient>();
-
- if (zkHostString != null && zkHostString.length > 0) {
- for (int i = 0; i < zkHostString.length; i++) {
- CloudSolrClient sc = getCloudSolrClient(zkHostString[i]);
- sc.setDefaultCollection(conf.get(SolrConstants.COLLECTION));
- solrClients.add(sc);
- }
- } else {
- for (int i = 0; i < urls.length; i++) {
- SolrClient sc = new HttpSolrClient(urls[i]);
- solrClients.add(sc);
- }
- }
-
- return solrClients;
- }
-
- public static CloudSolrClient getCloudSolrClient(String url)
- throws MalformedURLException {
- CloudSolrClient sc = new CloudSolrClient(url.replace('|', ','));
- sc.setParallelUpdates(true);
+ static CloudSolrClient getCloudSolrClient(List<String> urls) {
+ CloudSolrClient sc = new CloudSolrClient.Builder(urls)
+ .withParallelUpdates(true).build();
sc.connect();
return sc;
}
- public static SolrClient getHttpSolrClient(String url)
- throws MalformedURLException {
- SolrClient sc = new HttpSolrClient(url);
+ static CloudSolrClient getCloudSolrClient(List<String> urls, String
username, String password) {
+ // Building http client
+ CredentialsProvider provider = new BasicCredentialsProvider();
+ UsernamePasswordCredentials credentials
+ = new UsernamePasswordCredentials(username, password);
+ provider.setCredentials(AuthScope.ANY, credentials);
+
+ HttpClient client = HttpClientBuilder.create()
+ .setDefaultCredentialsProvider(provider)
+ .build();
+
+ // Building the client
+ CloudSolrClient sc = new CloudSolrClient.Builder(urls)
+ .withParallelUpdates(true).withHttpClient(client).build();
+ sc.connect();
return sc;
}
- public static String stripNonCharCodepoints(String input) {
+ static SolrClient getHttpSolrClient(String url) {
+ return new HttpSolrClient.Builder(url).build();
+ }
+
+ static String stripNonCharCodepoints(String input) {
StringBuilder retval = new StringBuilder();
char ch;
@@ -95,5 +81,4 @@ public static String stripNonCharCodepoints(String input) {
return retval.toString();
}
-
}
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> Refactoring indexer-solr
> ------------------------
>
> Key: NUTCH-2600
> URL: https://issues.apache.org/jira/browse/NUTCH-2600
> Project: Nutch
> Issue Type: Improvement
> Components: indexer, plugin
> Affects Versions: 1.14
> Reporter: Roannel Fernández Hernández
> Priority: Minor
> Fix For: 1.15
>
>
> indexer-solr includes an interface (SolrConstants) that defines the parameters
> used by the plugin; however, in some places this interface is not used and a
> string literal is used instead. Also, the commitIndex parameter in the
> index-writers.xml file is never used in the code, and the collection parameter
> is missing from the index-writers.xml file.
> In addition, the configuration of the indexer-solr plugin suggests that it
> supports Basic Authentication, but the username and password are never
> used. I don't know the reason for this, but I believe it would be a good
> feature for Nutch. Besides, I think we should update the SolrJ library.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)