This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 8b3412a NUTCH-2322 URL not available for Jexl operations - apply
patch contributed by Markus Jelsma
8b3412a is described below
commit 8b3412a864b2bed1bdd633710ffa36b975d8e6bc
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sun Dec 17 15:32:04 2017 +0100
NUTCH-2322 URL not available for Jexl operations
- apply patch contributed by Markus Jelsma
---
src/java/org/apache/nutch/crawl/CrawlDatum.java | 18 ++++++++++++------
src/java/org/apache/nutch/crawl/CrawlDbReader.java | 2 +-
src/java/org/apache/nutch/crawl/Generator.java | 2 +-
3 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index e54c791..1facf0a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -23,14 +23,15 @@ import java.util.Map.Entry;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.Expression;
-import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.MapContext;
import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.ProtocolStatus;
/* The crawl state of a url. */
public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+
public static final String GENERATE_DIR_NAME = "crawl_generate";
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
@@ -525,12 +526,13 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
}
}
- public boolean evaluate(Expression expr) {
- if (expr != null) {
+ public boolean evaluate(Expression expr, String url) {
+ if (expr != null && url != null) {
// Create a context and add data
JexlContext jcontext = new MapContext();
// https://issues.apache.org/jira/browse/NUTCH-2229
+ jcontext.set("url", url);
jcontext.set("status", getStatusName(getStatus()));
jcontext.set("fetchTime", (long)(getFetchTime()));
jcontext.set("modifiedTime", (long)(getModifiedTime()));
@@ -542,24 +544,28 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
// Set metadata variables
for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
Object value = entry.getValue();
+ Text tkey = (Text)entry.getKey();
if (value instanceof FloatWritable) {
FloatWritable fvalue = (FloatWritable)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString(), fvalue.get());
}
if (value instanceof IntWritable) {
IntWritable ivalue = (IntWritable)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString(), ivalue.get());
}
if (value instanceof Text) {
Text tvalue = (Text)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
}
+
+ if (value instanceof ProtocolStatus) {
+ ProtocolStatus pvalue = (ProtocolStatus)value;
+ jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
+ }
+
}
try {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index af30664..ddd25ef 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -700,7 +700,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
// check expr
if (expr != null) {
- if (!value.evaluate(expr)) {
+ if (!value.evaluate(expr, key.toString())) {
return;
}
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index e5f4831..d85d578 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -252,7 +252,7 @@ public class Generator extends NutchTool implements Tool {
// check expr
if (expr != null) {
- if (!crawlDatum.evaluate(expr)) {
+ if (!crawlDatum.evaluate(expr, key.toString())) {
return;
}
}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].