This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit e0326de05197f8415eeb750d4d8fff764db87aa9 Author: Nicola Marcacci Rossi <[email protected]> AuthorDate: Fri Dec 15 14:18:57 2017 +0100 make fully configurable --- conf/nutch-default.xml | 20 ++++++++++++++-- .../elasticrest/ElasticRestConstants.java | 2 ++ .../elasticrest/ElasticRestIndexWriter.java | 28 ++++++++++++++++++---- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index bcb2e9e..1d9837f 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2122,12 +2122,28 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> A list of strings denoting the supported languages (e.g. `en,de,fr,it`). If this value is empty all documents will be sent to index ${elastic.rest.index}. If not empty the Rest client will distribute documents in different indices based on their `lang` property. - Indices are named with the following schema: ${elastic.rest.index}_${lang} (e.g. `nutch_de`). - Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}_others (e.g. `nutch_others`). + Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.separator}${lang} (e.g. `nutch_de`). + Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink} (e.g. `nutch_others`). </description> </property> <property> + <name>elastic.rest.separator</name> + <value>_</value> + <description> + Default value is `_`. Is used only if `elastic.rest.languages` is defined to build the index name (i.e. ${elastic.rest.index}${elastic.rest.separator}${lang}). + </description> +</property> + +<property> + <name>elastic.rest.sink</name> + <value>others</value> + <description> + Default value is `others`. Is used only if `elastic.rest.languages` is defined to build the index name where to store documents with unsupported languages (i.e. ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink}). + </description> +</property> + +<property> <name>elastic.rest.type</name> <value>doc</value> <description>Default type to send documents to.</description> diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java index 74f37eb..c0f5fe7 100644 --- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java +++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java @@ -32,4 +32,6 @@ public interface ElasticRestConstants { public static final String HOSTNAME_TRUST = ELASTIC_PREFIX + "trustallhostnames"; public static final String LANGUAGES = ELASTIC_PREFIX + "languages"; + public static final String SEPARATOR = ELASTIC_PREFIX + "separator"; + public static final String SINK = ELASTIC_PREFIX + "sink"; } diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java index 56cfab1..5e71b3c 100644 --- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java +++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java @@ -67,7 +67,9 @@ public class ElasticRestIndexWriter implements IndexWriter { .getLogger(ElasticRestIndexWriter.class); private static final int DEFAULT_MAX_BULK_DOCS = 250; - private static final int DEFAULT_MAX_BULK_LENGTH = 2500500; + private static final int DEFAULT_MAX_BULK_LENGTH = 2500500; + private static final String DEFAULT_SEPARATOR = "_"; + private static final String DEFAULT_SINK = "others"; private JestClient client; private String defaultIndex; @@ -93,6 +95,8 @@ public class ElasticRestIndexWriter implements IndexWriter { private BasicFuture<JestResult> basicFuture = null; private String[] languages = null; + private String separator = null; + private String sink = null; @Override public void open(JobConf job, String name) throws IOException { @@ -104,6 +108,8 @@ public class ElasticRestIndexWriter implements IndexWriter { https = job.getBoolean(ElasticRestConstants.HTTPS, false); trustAllHostnames = job.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false); languages = job.getStrings(ElasticRestConstants.LANGUAGES); + separator = job.get(ElasticRestConstants.SEPARATOR, DEFAULT_SEPARATOR); + sink = job.get(ElasticRestConstants.SINK, DEFAULT_SINK); // trust ALL certificates SSLContext sslContext = null; @@ -205,9 +211,9 @@ public class ElasticRestIndexWriter implements IndexWriter { } } if (exists) { - index = defaultIndex + "_" + language; + index = getLanguageIndexName(language); } else { - index = defaultIndex + "_others"; + index = getSinkIndexName(); } } else { index = defaultIndex; @@ -237,9 +243,9 @@ public class ElasticRestIndexWriter implements IndexWriter { if (languages != null && languages.length > 0) { Bulk.Builder bulkBuilder = new Bulk.Builder().defaultType(defaultType); for (String lang : languages) { - bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_" + lang).type(defaultType).build()); + bulkBuilder.addAction(new Delete.Builder(key).index(getLanguageIndexName(lang)).type(defaultType).build()); } - bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_others").type(defaultType).build()); + bulkBuilder.addAction(new Delete.Builder(key).index(getSinkIndexName()).type(defaultType).build()); client.execute(bulkBuilder.build()); } else { client.execute(new Delete.Builder(key).index(defaultIndex) @@ -359,4 +365,16 @@ public class ElasticRestIndexWriter implements IndexWriter { public Configuration getConf() { return config; } + + private String getLanguageIndexName(String lang) { + return getComposedIndexName(defaultIndex, lang); + } + + private String getSinkIndexName() { + return getComposedIndexName(defaultIndex, sink); + } + + private String getComposedIndexName(String prefix, String postfix) { + return prefix + separator + postfix; + } } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
