Author: markus
Date: Wed Jul  1 07:13:55 2015
New Revision: 1688569

URL: http://svn.apache.org/r1688569
Log:
NUTCH-1980 Jexl expressions for CrawlDbReader

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul  1 07:13:55 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1980 Jexl expressions for CrawlDbReader (markus)
+
 * NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp)
 
 * NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus)

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Jul  1 07:13:55 2015
@@ -80,6 +80,8 @@
                 <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" />
                 <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" />        
 
+                <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
+
                  <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
                  <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
                  <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Jul  1 07:13:55 2015
@@ -21,6 +21,11 @@ import java.io.*;
 import java.util.*;
 import java.util.Map.Entry;
 
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
 import org.apache.hadoop.io.*;
 import org.apache.nutch.util.*;
 
@@ -515,4 +520,67 @@ public class CrawlDatum implements Writa
       throw new RuntimeException(e);
     }
   }
-}
+  
+  public boolean evaluate(String expr) {
+    return evaluate(expr, true, true);
+  }
+  
+  public boolean evaluate(String expr, boolean silent, boolean strict) {
+    if (expr != null) {
+      // Create or retrieve a JexlEngine
+      JexlEngine jexl = new JexlEngine();
+      
+      jexl.setSilent(silent);
+      jexl.setStrict(strict);
+      
+      // Create an expression object and evaluate
+      return evaluate(jexl.createExpression(expr));
+    }
+    
+    return false;
+  }
+  
+  public boolean evaluate(Expression expr) {
+    if (expr != null) {
+      // Create a context and add data
+      JexlContext jcontext = new MapContext();
+            
+      // Set metadata variables
+      for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
+        Object value = entry.getValue();
+        
+        if (value instanceof FloatWritable) {
+          FloatWritable fvalue = (FloatWritable)value;
+          Text tkey = (Text)entry.getKey();
+          jcontext.set(tkey.toString(), fvalue.get());
+        }
+        
+        if (value instanceof IntWritable) {
+          IntWritable ivalue = (IntWritable)value;
+          Text tkey = (Text)entry.getKey();
+          jcontext.set(tkey.toString(), ivalue.get());
+        }
+        
+        if (value instanceof Text) {
+          Text tvalue = (Text)value;
+          Text tkey = (Text)entry.getKey();
+          
+          try {
+            Float number = Float.parseFloat(tvalue.toString());
+            jcontext.set(tkey.toString(), number);
+          } catch (Exception e) {}
+        }
+      }
+      
+      try {
+        if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+          return true;
+        }
+      } catch (Exception e) {
+        //
+      }
+    }
+
+    return false;
+  }
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1688569&r1=1688568&r2=1688569&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Jul  1 07:13:55 2015
@@ -78,6 +78,9 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.StringUtil;
 
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+
 /**
  * Read utility for the CrawlDB.
  * 
@@ -485,7 +488,7 @@ public class CrawlDbReader extends Confi
 
   public void processDumpJob(String crawlDb, String output,
       JobConf config, String format, String regex, String status,
-      Integer retry) throws IOException {
+      Integer retry, String expr) throws IOException {
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb dump: starting");
       LOG.info("CrawlDb db: " + crawlDb);
@@ -514,6 +517,8 @@ public class CrawlDbReader extends Confi
       job.set("regex", regex);
     if (retry != null)
       job.setInt("retry", retry);
+    if (expr != null)
+      job.set("expr", expr);
 
     job.setMapperClass(CrawlDbDumpMapper.class);
     job.setOutputKeyClass(Text.class);
@@ -531,6 +536,7 @@ public class CrawlDbReader extends Confi
     Matcher matcher = null;
     String status = null;
     Integer retry = null;
+    Expression expr = null;
 
     public void configure(JobConf job) {
       if (job.get("regex", null) != null) {
@@ -538,6 +544,13 @@ public class CrawlDbReader extends Confi
       }
       status = job.get("status", null);
       retry = job.getInt("retry", -1);
+
+      if (job.get("expr", null) != null) {
+        JexlEngine jexl = new JexlEngine();
+        jexl.setSilent(true);
+        jexl.setStrict(true);
+        expr = jexl.createExpression(job.get("expr", null));
+      }
     }
 
     public void close() {
@@ -567,6 +580,13 @@ public class CrawlDbReader extends Confi
           return;
         }
       }
+      
+      // check expr
+      if (expr != null) {
+        if (!value.evaluate(expr)) {
+          return;
+        }
+      }
 
       output.collect(key, value);
     }
@@ -650,6 +670,7 @@ public class CrawlDbReader extends Confi
       System.err.println("\t\t[-retry <num>]\tminimum retry count");
       System.err
           .println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
+      System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for this record");
       System.err
           .println("\t-url <url>\tprint information on <url> to System.out");
       System.err
@@ -676,6 +697,7 @@ public class CrawlDbReader extends Confi
         String regex = null;
         Integer retry = null;
         String status = null;
+        String expr = null;
         for (int j = i + 1; j < args.length; j++) {
           if (args[j].equals("-format")) {
             format = args[++j];
@@ -693,8 +715,12 @@ public class CrawlDbReader extends Confi
             status = args[++j];
             i = i + 2;
           }
+          if (args[j].equals("-expr")) {
+            expr = args[++j];
+            i=i+2;
+          }
         }
-        dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry);
+        dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr);
       } else if (args[i].equals("-url")) {
         param = args[++i];
         dbr.readUrl(crawlDb, param, job);
@@ -783,6 +809,7 @@ public class CrawlDbReader extends Confi
       String regex = null;
       Integer retry = null;
       String status = null;
+      String expr = null;
       if (args.containsKey("format")) {
         format = args.get("format");
       }
@@ -795,7 +822,10 @@ public class CrawlDbReader extends Confi
       if (args.containsKey("status")) {
         status = args.get("status");
       }
-      processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry);
+      if (args.containsKey("expr")) {
+        expr = args.get("expr");
+      }
+      processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr);
       File dumpFile = new File(output+"/part-00000");
       return dumpFile;           
     }


Reply via email to