This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 194fc37cb5aa2879b279014bbeaf3bd207af85fd Author: Nicola Marcacci Rossi <[email protected]> AuthorDate: Wed Dec 13 16:33:00 2017 +0100 Extend indexer-elastic-rest to support languages --- .../elasticrest/ElasticRestConstants.java | 4 +- .../elasticrest/ElasticRestIndexWriter.java | 43 +++++++++++++++++----- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java index 322ff44..74f37eb 100644 --- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java +++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java @@ -30,4 +30,6 @@ public interface ElasticRestConstants { public static final String TYPE = ELASTIC_PREFIX + "type"; public static final String HTTPS = ELASTIC_PREFIX + "https"; public static final String HOSTNAME_TRUST = ELASTIC_PREFIX + "trustallhostnames"; -} \ No newline at end of file + + public static final String LANGUAGES = ELASTIC_PREFIX + "languages"; +} diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java index 1364722..dc54058 100644 --- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java +++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java @@ -32,7 +32,6 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.JobConf; -import org.apache.http.HttpResponse; import org.apache.http.concurrent.BasicFuture; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.NoopHostnameVerifier; @@ -48,7 +47,6 @@ import org.slf4j.LoggerFactory; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLContext; -import java.io.BufferedReader; import java.io.IOException; import java.net.URL; import java.security.KeyManagementException; @@ -58,11 +56,9 @@ import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.HashMap; import java.util.Map; -import java.util.MissingResourceException; import java.util.HashSet; import java.util.Set; import java.util.concurrent.ExecutionException; -import java.util.concurrent.Future; /** */ @@ -80,7 +76,6 @@ public class ElasticRestIndexWriter implements IndexWriter { private Configuration config; private Bulk.Builder bulkBuilder; - private Future<HttpResponse> execute; private int port = -1; private String host = null; private Boolean https = null; @@ -96,6 +91,8 @@ public class ElasticRestIndexWriter implements IndexWriter { private boolean createNewBulk = false; private long millis; private BasicFuture<JestResult> basicFuture = null; + + private String[] languages = null; @Override public void open(JobConf job, String name) throws IOException { @@ -106,6 +103,7 @@ public class ElasticRestIndexWriter implements IndexWriter { password = job.get(ElasticRestConstants.PASSWORD); https = job.getBoolean(ElasticRestConstants.HTTPS, false); trustAllHostnames = job.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false); + languages = job.getStrings(ElasticRestConstants.LANGUAGES); // trust ALL certificates SSLContext sslContext = null; @@ -195,7 +193,26 @@ public class ElasticRestIndexWriter implements IndexWriter { bulkLength += fieldValues[0].length(); } } - Index indexRequest = new Index.Builder(source).index(defaultIndex) + + String index; + if (languages != null && languages.length > 0) { + String language = (String) doc.getFieldValue("lang"); + boolean exists = false; + for (String lang : languages) { + if (lang.equals(language)) { + exists = true; + break; + } + } + if (exists) { + index = defaultIndex + "_" + language; + } else { + index = defaultIndex + "_others"; + } + } else { + index = defaultIndex; + } + Index indexRequest = new Index.Builder(source).index(index) .type(type).id(id).build(); // Add this indexing request to a bulk request @@ -217,13 +234,21 @@ public class ElasticRestIndexWriter implements IndexWriter { @Override public void delete(String key) throws IOException { try { - client.execute(new Delete.Builder(key).index(defaultIndex) - .type(defaultType).build()); + if (languages != null && languages.length > 0) { + Bulk.Builder bulkBuilder = new Bulk.Builder().defaultType(defaultType); + for (String lang : languages) { + bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_" + lang).build()); + } + bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_others").build()); + client.execute(bulkBuilder.build()); + } else { + client.execute(new Delete.Builder(key).index(defaultIndex) + .type(defaultType).build()); + } } catch (IOException e) { LOG.error(ExceptionUtils.getStackTrace(e)); throw e; } - } @Override -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
