This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 22fc7f0defb22588c4ade33b5693303f18d96253 Author: Sebastian Nagel <[email protected]> AuthorDate: Sun Dec 17 15:32:04 2017 +0100 NUTCH-2322 URL not available for Jexl operations - apply patch contributed by Markus Jelsma --- src/java/org/apache/nutch/crawl/CrawlDatum.java | 18 ++++++++++++------ src/java/org/apache/nutch/crawl/CrawlDbReader.java | 2 +- src/java/org/apache/nutch/crawl/Generator.java | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java index e54c791..1facf0a 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDatum.java +++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java @@ -23,14 +23,15 @@ import java.util.Map.Entry; import org.apache.commons.jexl2.JexlContext; import org.apache.commons.jexl2.Expression; -import org.apache.commons.jexl2.JexlEngine; import org.apache.commons.jexl2.MapContext; import org.apache.hadoop.io.*; import org.apache.nutch.util.*; +import org.apache.nutch.protocol.ProtocolStatus; /* The crawl state of a url. */ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable { + public static final String GENERATE_DIR_NAME = "crawl_generate"; public static final String FETCH_DIR_NAME = "crawl_fetch"; public static final String PARSE_DIR_NAME = "crawl_parse"; @@ -525,12 +526,13 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable { } } - public boolean evaluate(Expression expr) { - if (expr != null) { + public boolean evaluate(Expression expr, String url) { + if (expr != null && url != null) { // Create a context and add data JexlContext jcontext = new MapContext(); // https://issues.apache.org/jira/browse/NUTCH-2229 + jcontext.set("url", url); jcontext.set("status", getStatusName(getStatus())); jcontext.set("fetchTime", (long)(getFetchTime())); jcontext.set("modifiedTime", (long)(getModifiedTime())); @@ -542,24 +544,28 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable { // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); + Text tkey = (Text)entry.getKey(); if (value instanceof FloatWritable) { FloatWritable fvalue = (FloatWritable)value; - Text tkey = (Text)entry.getKey(); jcontext.set(tkey.toString(), fvalue.get()); } if (value instanceof IntWritable) { IntWritable ivalue = (IntWritable)value; - Text tkey = (Text)entry.getKey(); jcontext.set(tkey.toString(), ivalue.get()); } if (value instanceof Text) { Text tvalue = (Text)value; - Text tkey = (Text)entry.getKey(); jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString()); } + + if (value instanceof ProtocolStatus) { + ProtocolStatus pvalue = (ProtocolStatus)value; + jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString()); + } + } try { diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index af30664..ddd25ef 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -700,7 +700,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool { // check expr if (expr != null) { - if (!value.evaluate(expr)) { + if (!value.evaluate(expr, key.toString())) { return; } } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index e5f4831..d85d578 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -252,7 +252,7 @@ public class Generator extends NutchTool implements Tool { // check expr if (expr != null) { - if (!crawlDatum.evaluate(expr)) { + if (!crawlDatum.evaluate(expr, key.toString())) { return; } } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
