[
https://issues.apache.org/jira/browse/NUTCH-2399?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16218795#comment-16218795
]
ASF GitHub Bot commented on NUTCH-2399:
---------------------------------------
jorgelbg closed pull request #200: NUTCH-2399: Support multivalue fields.
URL: https://github.com/apache/nutch/pull/200
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign (fork) pull request
once it has been merged, the diff is reproduced below for the sake of
provenance:
diff --git a/build.xml b/build.xml
index 2d33f96ac..2455f77b6 100644
--- a/build.xml
+++ b/build.xml
@@ -1041,6 +1041,8 @@
<source path="${plugins.dir}/indexer-elastic/src/java/" />
<source path="${plugins.dir}/indexer-elastic/src/test/" />
<source path="${plugins.dir}/indexer-elastic-rest/src/java/"/>
+ <source path="${plugins.dir}/index-jexl-filter/src/java/" />
+ <source path="${plugins.dir}/index-jexl-filter/src/test/" />
<source path="${plugins.dir}/index-metadata/src/java/" />
<source path="${plugins.dir}/index-more/src/java/" />
<source path="${plugins.dir}/index-more/src/test/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c406907c5..a78635252 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1566,6 +1566,34 @@ visit
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>
+<property>
+ <name>lang.index.languages</name>
+ <value></value>
+ <description>If not empty, should be a comma separated list of language
codes.
+ Only documents with one of these language codes will be indexed.
+ "unknown" is a valid language code, will match documents where language
+ detection failed.
+ </description>
+</property>
+
+<!-- index-jexl-filter plugin properties -->
+
+<property>
+ <name>index.jexl.filter</name>
+ <value></value>
+ <description> A JEXL expression. If it evaluates to false,
+ the document will not be indexed.
+ Available primitives in the JEXL context:
+ * status, fetchTime, modifiedTime, retries, interval, score, signature, url,
text, title
+ Available objects in the JEXL context:
+ * httpStatus - contains majorCode, minorCode, message
+ * documentMeta, contentMeta, parseMeta - contain all the Metadata properties.
+ each property value is always an array of Strings (so if you expect one
value, use [0])
+ * doc - contains all the NutchFields from the NutchDocument.
+ each property value is always an array of Objects.
+ </description>
+</property>
+
<!-- index-static plugin properties -->
<property>
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 9074654de..7ea63ce8a 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -32,6 +32,7 @@
<ant dir="index-basic" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-geoip" target="deploy"/>
+ <ant dir="index-jexl-filter" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="index-replace" target="deploy"/>
<ant dir="index-static" target="deploy"/>
@@ -153,6 +154,7 @@
<ant dir="index-basic" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-geoip" target="clean"/>
+ <ant dir="index-jexl-filter" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="index-static" target="clean"/>
<ant dir="index-replace" target="clean"/>
diff --git a/src/plugin/index-jexl-filter/build.xml
b/src/plugin/index-jexl-filter/build.xml
new file mode 100644
index 000000000..7aa7be24d
--- /dev/null
+++ b/src/plugin/index-jexl-filter/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-jexl-filter" default="jar-core">
+
+ <import file="../build-plugin.xml" />
+
+ </project>
diff --git a/src/plugin/index-jexl-filter/ivy.xml
b/src/plugin/index-jexl-filter/ivy.xml
new file mode 100644
index 000000000..0a363f774
--- /dev/null
+++ b/src/plugin/index-jexl-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/index-jexl-filter/plugin.xml
b/src/plugin/index-jexl-filter/plugin.xml
new file mode 100644
index 000000000..9bb543820
--- /dev/null
+++ b/src/plugin/index-jexl-filter/plugin.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-jexl-filter"
+ name="Filter indexed documents according to a JEXL expression"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="index-jexl-filter.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.indexer.filter"
+ name="Nutch JEXL filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="JexlIndexingFilter"
+
class="org.apache.nutch.indexer.filter.JexlIndexingFilter"/>
+ </extension>
+
+</plugin>
diff --git
a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/filter/JexlIndexingFilter.java
b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/filter/JexlIndexingFilter.java
new file mode 100644
index 000000000..233ead3b3
--- /dev/null
+++
b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/filter/JexlIndexingFilter.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Map.Entry;
+
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.MapContext;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.JexlUtil;
+import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering of
+ * documents based on a JEXL expression.
+ *
+ */
+public class JexlIndexingFilter implements IndexingFilter {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private Configuration conf;
+ private Expression expr;
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks)
+ throws IndexingException {
+ // Create a context and add data
+ JexlContext jcontext = new MapContext();
+
+ jcontext.set("status",
CrawlDatum.getStatusName(datum.getStatus()));
+ jcontext.set("fetchTime", (long) (datum.getFetchTime()));
+ jcontext.set("modifiedTime", (long) (datum.getModifiedTime()));
+ jcontext.set("retries", datum.getRetriesSinceFetch());
+ jcontext.set("interval", new Integer(datum.getFetchInterval()));
+ jcontext.set("score", datum.getScore());
+ jcontext.set("signature",
StringUtil.toHexString(datum.getSignature()));
+ jcontext.set("url", url.toString());
+
+ jcontext.set("text", parse.getText());
+ jcontext.set("title", parse.getData().getTitle());
+
+ JexlContext httpStatusContext = new MapContext();
+ httpStatusContext.set("majorCode",
parse.getData().getStatus().getMajorCode());
+ httpStatusContext.set("minorCode",
parse.getData().getStatus().getMinorCode());
+ httpStatusContext.set("message",
parse.getData().getStatus().getMessage());
+ jcontext.set("httpStatus", httpStatusContext);
+
+ jcontext.set("documentMeta",
metadataToContext(doc.getDocumentMeta()));
+ jcontext.set("contentMeta",
metadataToContext(parse.getData().getContentMeta()));
+ jcontext.set("parseMeta",
metadataToContext(parse.getData().getParseMeta()));
+
+ JexlContext context = new MapContext();
+ for (Entry<String, NutchField> entry : doc) {
+ context.set(entry.getKey(),
entry.getValue().getValues());
+ }
+ jcontext.set("doc", context);
+
+ try {
+ if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+ return doc;
+ }
+ } catch (Exception e) {
+ LOG.warn("Failed evaluating JEXL {}",
expr.getExpression(), e);
+ }
+ return null;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ String str = conf.get("index.jexl.filter");
+ if (str == null) {
+ LOG.warn(
+ "The property index.jexl.filter must
have a value when index-jexl-filter is used. You can use 'true' or 'false' to
index all/none");
+ throw new RuntimeException(
+ "The property index.jexl.filter must
have a value when index-jexl-filter is used. You can use 'true' or 'false' to
index all/none");
+ }
+ expr = JexlUtil.parseExpression(str);
+ if (expr == null) {
+ LOG.warn("Failed parsing JEXL from index.jexl.filter:
{}", str);
+ throw new RuntimeException("Failed parsing JEXL from
index.jexl.filter");
+ }
+ }
+
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ private JexlContext metadataToContext(Metadata metadata) {
+ JexlContext context = new MapContext();
+ for (String name : metadata.names()) {
+ context.set(name, metadata.getValues(name));
+ }
+ return context;
+ }
+}
diff --git
a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/filter/TestJexlIndexingFilter.java
b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/filter/TestJexlIndexingFilter.java
new file mode 100644
index 000000000..286d29651
--- /dev/null
+++
b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/filter/TestJexlIndexingFilter.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class TestJexlIndexingFilter {
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void testAllowMatchingDocument() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+
+ JexlIndexingFilter filter = new JexlIndexingFilter();
+ filter.setConf(conf);
+ Assert.assertNotNull(filter);
+
+ NutchDocument doc = new NutchDocument();
+
+ String title = "The Foo Page";
+ Outlink[] outlinks = new Outlink[] { new
Outlink("http://foo.com/", "Foo") };
+ Metadata metaData = new Metadata();
+ metaData.add("Language", "en/us");
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
title, outlinks, metaData);
+ ParseImpl parse = new ParseImpl("this is a sample foo bar page.
hope you enjoy it.", parseData);
+
+ CrawlDatum crawlDatum = new CrawlDatum();
+ crawlDatum.setFetchTime(100L);
+
+ Inlinks inlinks = new Inlinks();
+
+ doc.add("lang", "en");
+
+ NutchDocument result = filter.filter(doc, parse, new
Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
+ Assert.assertNotNull(result);
+ Assert.assertEquals(doc, result);
+ }
+
+ @Test
+ public void testBlockNotMatchingDocuments() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+
+ JexlIndexingFilter filter = new JexlIndexingFilter();
+ filter.setConf(conf);
+ Assert.assertNotNull(filter);
+
+ NutchDocument doc = new NutchDocument();
+
+ String title = "The Foo Page";
+ Outlink[] outlinks = new Outlink[] { new
Outlink("http://foo.com/", "Foo") };
+ Metadata metaData = new Metadata();
+ metaData.add("Language", "en/us");
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
title, outlinks, metaData);
+ ParseImpl parse = new ParseImpl("this is a sample foo bar page.
hope you enjoy it.", parseData);
+
+ CrawlDatum crawlDatum = new CrawlDatum();
+ crawlDatum.setFetchTime(100L);
+
+ Inlinks inlinks = new Inlinks();
+
+ doc.add("lang", "ru");
+
+ NutchDocument result = filter.filter(doc, parse, new
Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
+ Assert.assertNull(result);
+ }
+
+ @Test
+ public void testMissingConfiguration() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ JexlIndexingFilter filter = new JexlIndexingFilter();
+ thrown.expect(RuntimeException.class);
+ filter.setConf(conf);
+ }
+
+ @Test
+ public void testInvalidExpression() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("index.jexl.filter", "doc.lang[0]=<>:='en'");
+
+ JexlIndexingFilter filter = new JexlIndexingFilter();
+ thrown.expect(RuntimeException.class);
+ filter.setConf(conf);
+ }
+}
diff --git
a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index dfcc01cf4..77f81d2f5 100644
---
a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++
b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -24,6 +24,7 @@
import java.io.IOException;
import java.net.InetAddress;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
@@ -32,6 +33,7 @@
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BackoffPolicy;
@@ -175,8 +177,14 @@ public void write(NutchDocument doc) throws IOException {
// Add each field of this doc to the index source
Map<String, Object> source = new HashMap<String, Object>();
for (String fieldName : doc.getFieldNames()) {
- if (doc.getFieldValue(fieldName) != null) {
- source.put(fieldName, doc.getFieldValue(fieldName));
+ NutchField field = doc.getField(fieldName);
+ if (field != null) {
+ List<Object> values = field.getValues();
+ if (values.size()==1) {
+ source.put(fieldName, values.get(0));
+ } else if (values.size()>1) {
+ source.put(fieldName, values);
+ }
}
}
diff --git
a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
index fbfe8f978..cd954c70d 100644
---
a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
+++
b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
@@ -27,6 +27,9 @@
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
+import java.util.HashSet;
+import java.util.Set;
+
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
@@ -49,6 +52,7 @@
public class LanguageIndexingFilter implements IndexingFilter {
private Configuration conf;
+ private Set<String> indexLangs;
/**
* Constructs a new Language Indexing Filter.
@@ -73,6 +77,10 @@ public NutchDocument filter(NutchDocument doc, Parse parse,
Text url,
lang = "unknown";
}
+ if (!indexLangs.isEmpty() && !indexLangs.contains(lang)) {
+ return null;
+ }
+
doc.add("lang", lang);
return doc;
@@ -80,6 +88,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse,
Text url,
public void setConf(Configuration conf) {
this.conf = conf;
+ indexLangs = new
HashSet<>(conf.getStringCollection("lang.index.languages"));
}
public Configuration getConf() {
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> indexer-elastic does not index multi-value fields (only the first value is indexed)
> -----------------------------------------------------------------------------------
>
> Key: NUTCH-2399
> URL: https://issues.apache.org/jira/browse/NUTCH-2399
> Project: Nutch
> Issue Type: Bug
> Components: indexer
> Reporter: Yossi Tamari
> Fix For: 1.14
>
>
> Currently, if a NutchField has multiple values, only the first value is
> indexed (because that is what doc.getFieldValue returns). Pull request
> #200 checks whether the NutchField has multiple values and, if so, adds
> them as an array (multivalue) field.
> [https://github.com/apache/nutch/pull/200]
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)