Author: markus Date: Wed Feb 24 13:05:02 2016 New Revision: 1732140 URL: http://svn.apache.org/viewvc?rev=1732140&view=rev Log: NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732140&r1=1732139&r2=1732140&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 13:05:02 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) + * NUTCH-2227 RegexParseFilter (markus) * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732140&r1=1732139&r2=1732140&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 13:05:02 2016 @@ -521,30 +521,20 @@ public class CrawlDatum implements Writa } } - public boolean evaluate(String expr) { - return evaluate(expr, true, true); - } - - public boolean evaluate(String expr, boolean silent, boolean strict) { - if (expr != null) { - // Create or retrieve a JexlEngine - JexlEngine jexl = new JexlEngine(); - - jexl.setSilent(silent); - jexl.setStrict(strict); - - // Create an expression object and evaluate - return evaluate(jexl.createExpression(expr)); - } - - return false; - } - public boolean evaluate(Expression expr) { if (expr != null) { // Create a context and add data JexlContext jcontext = new MapContext(); - + + // https://issues.apache.org/jira/browse/NUTCH-2229 + jcontext.set("status", getStatusName(getStatus())); + jcontext.set("fetchTime", (long)(getFetchTime())); + 
jcontext.set("modifiedTime", (long)(getModifiedTime())); + jcontext.set("retries", getRetriesSinceFetch()); + jcontext.set("interval", new Integer(getFetchInterval())); + jcontext.set("score", getScore()); + jcontext.set("signature", StringUtil.toHexString(getSignature())); + // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); @@ -571,7 +561,7 @@ public class CrawlDatum implements Writa } catch (Exception e) {} } } - + try { if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { return true; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732140&r1=1732139&r2=1732140&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 13:05:02 2016 @@ -70,6 +70,7 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.lang.time.DateUtils; /** * Read utility for the CrawlDB. 
@@ -522,6 +523,7 @@ public class CrawlDbReader extends Confi public static class CrawlDbDumpMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { + Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); Pattern pattern = null; Matcher matcher = null; String status = null; @@ -534,12 +536,30 @@ public class CrawlDbReader extends Confi } status = job.get("status", null); retry = job.getInt("retry", -1); - + String exprStr = job.get("expr", null); + if (job.get("expr", null) != null) { - JexlEngine jexl = new JexlEngine(); - jexl.setSilent(true); - jexl.setStrict(true); - expr = jexl.createExpression(job.get("expr", null)); + try { + // Translate any date object into a long, dates must be specified as 2016-03-20T00:00:00Z + Matcher matcher = datePattern.matcher(exprStr); + if (matcher.find()) { + String date = matcher.group(); + + // Parse the thing and get epoch! + Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"}); + long time = parsedDate.getTime(); + + // Replace in the original expression + exprStr = exprStr.replace(date, Long.toString(time)); + } + + JexlEngine jexl = new JexlEngine(); + jexl.setSilent(true); + jexl.setStrict(true); + expr = jexl.createExpression(exprStr); + } catch (Exception e) { + LOG.error(e.getMessage()); + } } }