[ 
https://issues.apache.org/jira/browse/NUTCH-2415?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16295147#comment-16295147
 ] 

ASF GitHub Bot commented on NUTCH-2415:
---------------------------------------

sebastian-nagel closed pull request #219: NUTCH-2415 : Create a JEXL based 
IndexingFilter
URL: https://github.com/apache/nutch/pull/219
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/build.xml b/build.xml
index 47c2a2ede..56be49533 100644
--- a/build.xml
+++ b/build.xml
@@ -179,6 +179,7 @@
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
+      <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
       <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
@@ -630,6 +631,7 @@
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
+      <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
       <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
@@ -1040,6 +1042,8 @@
         <source path="${plugins.dir}/index-basic/src/java/" />
         <source path="${plugins.dir}/index-basic/src/test/" />
         <source path="${plugins.dir}/index-geoip/src/java/" />
+        <source path="${plugins.dir}/index-jexl-filter/src/java/" />
+        <source path="${plugins.dir}/index-jexl-filter/src/test/" />
         <source path="${plugins.dir}/index-links/src/java/" />
         <source path="${plugins.dir}/index-links/src/test/" />
         <source path="${plugins.dir}/index-metadata/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index e68b0dd84..5e8606fe4 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1618,6 +1618,24 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
+<!-- index-jexl-filter plugin properties -->
+
+<property>
+  <name>index.jexl.filter</name>
+  <value></value>
+  <description> A JEXL expression. If it evaluates to false,
+  the document will not be indexed.
+  Available primitives in the JEXL context:
+  * status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title
+  Available objects in the JEXL context:
+  * httpStatus - contains majorCode, minorCode, message
+  * documentMeta, contentMeta, parseMeta - contain all the Metadata properties.
+    each property value is always an array of Strings (so if you expect one value, use [0])
+  * doc - contains all the NutchFields from the NutchDocument.
+    each property value is always an array of Objects.
+  </description>
+</property>
+
 <!-- index-static plugin properties -->
 
 <property>
diff --git a/default.properties b/default.properties
index 6b7a6ab79..c057518d8 100644
--- a/default.properties
+++ b/default.properties
@@ -170,6 +170,7 @@ plugins.index=\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
    org.apache.nutch.indexer.geoip*:\
+   org.apache.nutch.indexer.jexl*:\
    org.apache.nutch.indexer.filter*:\
    org.apache.nutch.indexer.links*:\
    org.apache.nutch.indexer.metadata*:\
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 5402d036c..5052082cd 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -40,6 +40,7 @@
     <ant dir="index-anchor" target="deploy"/>
     <ant dir="index-basic" target="deploy"/>
     <ant dir="index-geoip" target="deploy"/>
+    <ant dir="index-jexl-filter" target="deploy"/>
     <ant dir="index-links" target="deploy"/>
     <ant dir="index-metadata" target="deploy"/>
     <ant dir="index-more" target="deploy"/>
@@ -159,6 +160,7 @@
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-geoip" target="clean"/>
+    <ant dir="index-jexl-filter" target="clean"/>
     <ant dir="index-links" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
     <ant dir="index-more" target="clean"/>
diff --git a/src/plugin/index-jexl-filter/build.xml b/src/plugin/index-jexl-filter/build.xml
new file mode 100644
index 000000000..7aa7be24d
--- /dev/null
+++ b/src/plugin/index-jexl-filter/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-jexl-filter" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+
+ </project>
diff --git a/src/plugin/index-jexl-filter/ivy.xml b/src/plugin/index-jexl-filter/ivy.xml
new file mode 100644
index 000000000..0a363f774
--- /dev/null
+++ b/src/plugin/index-jexl-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/index-jexl-filter/plugin.xml b/src/plugin/index-jexl-filter/plugin.xml
new file mode 100644
index 000000000..a24a0c95f
--- /dev/null
+++ b/src/plugin/index-jexl-filter/plugin.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-jexl-filter"
+   name="Filter indexed documents according to a JEXL expression"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="index-jexl-filter.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+    <extension id="org.apache.nutch.indexer.jexl"
+               name="Nutch JEXL filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="JexlIndexingFilter"
+                        class="org.apache.nutch.indexer.jexl.JexlIndexingFilter"/>
+    </extension>
+
+</plugin>
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
new file mode 100644
index 000000000..24284a67b
--- /dev/null
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.jexl;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Map.Entry;
+
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.MapContext;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.JexlUtil;
+import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering of
+ * documents based on a JEXL expression.
+ *
+ */
+public class JexlIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Configuration conf;
+  private Expression expr;
+
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    // Create a context and add data
+    JexlContext jcontext = new MapContext();
+
+    jcontext.set("status", CrawlDatum.getStatusName(datum.getStatus()));
+    jcontext.set("fetchTime", (long) (datum.getFetchTime()));
+    jcontext.set("modifiedTime", (long) (datum.getModifiedTime()));
+    jcontext.set("retries", datum.getRetriesSinceFetch());
+    jcontext.set("interval", new Integer(datum.getFetchInterval()));
+    jcontext.set("score", datum.getScore());
+    jcontext.set("signature", StringUtil.toHexString(datum.getSignature()));
+    jcontext.set("url", url.toString());
+
+    jcontext.set("text", parse.getText());
+    jcontext.set("title", parse.getData().getTitle());
+
+    JexlContext httpStatusContext = new MapContext();
+    httpStatusContext.set("majorCode",
+        parse.getData().getStatus().getMajorCode());
+    httpStatusContext.set("minorCode",
+        parse.getData().getStatus().getMinorCode());
+    httpStatusContext.set("message", parse.getData().getStatus().getMessage());
+    jcontext.set("httpStatus", httpStatusContext);
+
+    jcontext.set("documentMeta", metadataToContext(doc.getDocumentMeta()));
+    jcontext.set("contentMeta",
+        metadataToContext(parse.getData().getContentMeta()));
+    jcontext.set("parseMeta",
+        metadataToContext(parse.getData().getParseMeta()));
+
+    JexlContext context = new MapContext();
+    for (Entry<String, NutchField> entry : doc) {
+      context.set(entry.getKey(), entry.getValue().getValues());
+    }
+    jcontext.set("doc", context);
+
+    try {
+      if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+        return doc;
+      }
+    } catch (Exception e) {
+      LOG.warn("Failed evaluating JEXL {}", expr.getExpression(), e);
+    }
+    return null;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String str = conf.get("index.jexl.filter");
+    if (str == null) {
+      LOG.warn(
+          "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
+      throw new RuntimeException(
+          "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
+    }
+    expr = JexlUtil.parseExpression(str);
+    if (expr == null) {
+      LOG.warn("Failed parsing JEXL from index.jexl.filter: {}", str);
+      throw new RuntimeException("Failed parsing JEXL from index.jexl.filter");
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  private JexlContext metadataToContext(Metadata metadata) {
+    JexlContext context = new MapContext();
+    for (String name : metadata.names()) {
+      context.set(name, metadata.getValues(name));
+    }
+    return context;
+  }
+}
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java
new file mode 100644
index 000000000..809f716a1
--- /dev/null
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>This plugin implements a dynamic indexing filter which uses JEXL 
+ * expressions to allow filtering based on the page's metadata 
+ * <p>Available primitives in the JEXL context:<ul>
+  * <li>status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title</li></ul>
+ * <p>Available objects in the JEXL context:<ul>
+ * <li>httpStatus - contains majorCode, minorCode, message</li>
+ * <li>documentMeta, contentMeta, parseMeta - contain all the Metadata properties.<br>
+ *   Each property value is always an array of Strings (so if you expect one value, use [0])</li>
+ * <li>doc - contains all the NutchFields from the NutchDocument.<br>
+ *   Each property value is always an array of Objects.</li></ul>
+ * 
+ */
+package org.apache.nutch.indexer.jexl;
\ No newline at end of file
diff --git a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
new file mode 100644
index 000000000..0427ad495
--- /dev/null
+++ b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.jexl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class TestJexlIndexingFilter {
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  @Test
+  public void testAllowMatchingDocument() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+
+    JexlIndexingFilter filter = new JexlIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+
+    NutchDocument doc = new NutchDocument();
+
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] {
+        new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    doc.add("lang", "en");
+
+    NutchDocument result = filter.filter(doc, parse,
+        new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
+    Assert.assertNotNull(result);
+    Assert.assertEquals(doc, result);
+  }
+
+  @Test
+  public void testBlockNotMatchingDocuments() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+
+    JexlIndexingFilter filter = new JexlIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+
+    NutchDocument doc = new NutchDocument();
+
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] {
+        new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    doc.add("lang", "ru");
+
+    NutchDocument result = filter.filter(doc, parse,
+        new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
+    Assert.assertNull(result);
+  }
+
+  @Test
+  public void testMissingConfiguration() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    JexlIndexingFilter filter = new JexlIndexingFilter();
+    thrown.expect(RuntimeException.class);
+    filter.setConf(conf);
+  }
+
+  @Test
+  public void testInvalidExpression() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("index.jexl.filter", "doc.lang[0]=<>:='en'");
+
+    JexlIndexingFilter filter = new JexlIndexingFilter();
+    thrown.expect(RuntimeException.class);
+    filter.setConf(conf);
+  }
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Create a JEXL based IndexingFilter
> ----------------------------------
>
>                 Key: NUTCH-2415
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2415
>             Project: Nutch
>          Issue Type: New Feature
>          Components: plugin
>    Affects Versions: 1.13
>            Reporter: Yossi Tamari
>            Assignee: Jorge Luis Betancourt Gonzalez
>            Priority: Minor
>             Fix For: 1.14
>
>
> Following on NUTCH-2414 and NUTCH-2412, the requirement was raised for an 
> IndexingFilter plugin which will decide whether to index a document based on 
> a JEXL expression.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to