Author: markus
Date: Wed Jul 1 07:13:55 2015
New Revision: 1688569
URL: http://svn.apache.org/r1688569
Log:
NUTCH-1980 Jexl expressions for CrawlDbReader
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 1 07:13:55 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1980 Jexl expressions for CrawlDbReader (markus)
+
* NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp)
* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Jul 1 07:13:55 2015
@@ -80,6 +80,8 @@
<dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
<dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" />
+ <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
+
<dependency org="org.apache.mahout" name="mahout-math"
rev="0.8" />
<dependency org="org.apache.mahout" name="mahout-core"
rev="0.8" />
<dependency org="org.apache.lucene" name="lucene-core"
rev="4.3.0" />
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Jul 1 07:13:55 2015
@@ -21,6 +21,11 @@ import java.io.*;
import java.util.*;
import java.util.Map.Entry;
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;
@@ -515,4 +520,67 @@ public class CrawlDatum implements Writa
throw new RuntimeException(e);
}
}
-}
+
+ public boolean evaluate(String expr) {
+ return evaluate(expr, true, true); // delegates with silent=true and strict=true
+ }
+
+ public boolean evaluate(String expr, boolean silent, boolean strict) {
+ if (expr != null) {
+ // A new JexlEngine is constructed on every call; nothing is cached or reused here
+ JexlEngine jexl = new JexlEngine();
+
+ jexl.setSilent(silent);
+ jexl.setStrict(strict);
+
+ // Compile the expression string, then hand it to the Expression-based overload
+ return evaluate(jexl.createExpression(expr));
+ }
+
+ return false; // a null expression string never matches
+ }
+
+ public boolean evaluate(Expression expr) {
+ if (expr != null) {
+ // Context holding the variables the expression is evaluated against
+ JexlContext jcontext = new MapContext();
+
+ // Expose metadata entries as variables named after their keys (float, int, numeric text only)
+ for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
+ Object value = entry.getValue();
+
+ if (value instanceof FloatWritable) {
+ FloatWritable fvalue = (FloatWritable)value;
+ Text tkey = (Text)entry.getKey(); // NOTE(review): assumes the key is a Text; this cast is outside any try block
+ jcontext.set(tkey.toString(), fvalue.get());
+ }
+
+ if (value instanceof IntWritable) {
+ IntWritable ivalue = (IntWritable)value;
+ Text tkey = (Text)entry.getKey();
+ jcontext.set(tkey.toString(), ivalue.get());
+ }
+
+ if (value instanceof Text) {
+ Text tvalue = (Text)value;
+ Text tkey = (Text)entry.getKey();
+
+ try {
+ Float number = Float.parseFloat(tvalue.toString());
+ jcontext.set(tkey.toString(), number);
+ } catch (Exception e) {} // text that does not parse as a float is left out of the context
+ }
+ }
+
+ try {
+ if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { // only a Boolean true result counts as a match
+ return true;
+ }
+ } catch (Exception e) {
+ // Evaluation failures are swallowed here; control falls through to the final return false
+ }
+ }
+
+ return false; // null expression, non-true result, or evaluation failure
+ }
+}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Jul 1 07:13:55 2015
@@ -78,6 +78,9 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+
/**
* Read utility for the CrawlDB.
*
@@ -485,7 +488,7 @@ public class CrawlDbReader extends Confi
public void processDumpJob(String crawlDb, String output,
JobConf config, String format, String regex, String status,
- Integer retry) throws IOException {
+ Integer retry, String expr) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
@@ -514,6 +517,8 @@ public class CrawlDbReader extends Confi
job.set("regex", regex);
if (retry != null)
job.setInt("retry", retry);
+ if (expr != null)
+ job.set("expr", expr);
job.setMapperClass(CrawlDbDumpMapper.class);
job.setOutputKeyClass(Text.class);
@@ -531,6 +536,7 @@ public class CrawlDbReader extends Confi
Matcher matcher = null;
String status = null;
Integer retry = null;
+ Expression expr = null;
public void configure(JobConf job) {
if (job.get("regex", null) != null) {
@@ -538,6 +544,13 @@ public class CrawlDbReader extends Confi
}
status = job.get("status", null);
retry = job.getInt("retry", -1);
+
+ if (job.get("expr", null) != null) {
+ JexlEngine jexl = new JexlEngine();
+ jexl.setSilent(true);
+ jexl.setStrict(true);
+ expr = jexl.createExpression(job.get("expr", null));
+ }
}
public void close() {
@@ -567,6 +580,13 @@ public class CrawlDbReader extends Confi
return;
}
}
+
+ // check expr
+ if (expr != null) {
+ if (!value.evaluate(expr)) {
+ return;
+ }
+ }
output.collect(key, value);
}
@@ -650,6 +670,7 @@ public class CrawlDbReader extends Confi
System.err.println("\t\t[-retry <num>]\tminimum retry count");
System.err
.println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
+ System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for this record");
System.err
.println("\t-url <url>\tprint information on <url> to System.out");
System.err
@@ -676,6 +697,7 @@ public class CrawlDbReader extends Confi
String regex = null;
Integer retry = null;
String status = null;
+ String expr = null;
for (int j = i + 1; j < args.length; j++) {
if (args[j].equals("-format")) {
format = args[++j];
@@ -693,8 +715,12 @@ public class CrawlDbReader extends Confi
status = args[++j];
i = i + 2;
}
+ if (args[j].equals("-expr")) {
+ expr = args[++j];
+ i=i+2;
+ }
}
- dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry);
+ dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr);
} else if (args[i].equals("-url")) {
param = args[++i];
dbr.readUrl(crawlDb, param, job);
@@ -783,6 +809,7 @@ public class CrawlDbReader extends Confi
String regex = null;
Integer retry = null;
String status = null;
+ String expr = null;
if (args.containsKey("format")) {
format = args.get("format");
}
@@ -795,7 +822,10 @@ public class CrawlDbReader extends Confi
if (args.containsKey("status")) {
status = args.get("status");
}
- processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry);
+ if (args.containsKey("expr")) {
+ expr = args.get("expr");
+ }
+ processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr);
File dumpFile = new File(output+"/part-00000");
return dumpFile;
}