This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 7d3900450 NUTCH-2924 Generate maxCount expr evaluated only once
7d3900450 is described below

commit 7d390045049036541d2fd94302ab97c8cb3e3cb1
Author: Markus Jelsma <mar...@apache.org>
AuthorDate: Mon Dec 12 16:13:40 2022 +0100

    NUTCH-2924 Generate maxCount expr evaluated only once
---
 src/java/org/apache/nutch/crawl/Generator.java | 103 +++++++++++--------------
 1 file changed, 44 insertions(+), 59 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 0fce6b3b0..8a2f87ba4 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -311,27 +311,30 @@ public class Generator extends NutchTool implements Tool {
     private SequenceFile.Reader[] hostdbReaders = null;
     private JexlScript maxCountExpr = null;
     private JexlScript fetchDelayExpr = null;
-
-    public void open() {
-      if (conf.get(GENERATOR_HOSTDB) != null) {
-        try {
-          Path path = new Path(conf.get(GENERATOR_HOSTDB), "current");
-          hostdbReaders = SegmentReaderUtil.getReaders(path, conf);
-        } catch (IOException e) {
-          LOG.error("Error reading HostDB because {}", e.getMessage());
-        }
+    private Map<String, HostDatum> hostDatumCache = new HashMap<>();
+    
+    public void readHostDb() throws IOException {
+      if (conf.get(GENERATOR_HOSTDB) == null) {
+        return;
       }
-    }
-
-    public void close() {
-      if (hostdbReaders != null) {
-        try {
-          for (int i = 0; i < hostdbReaders.length; i++) {
-            hostdbReaders[i].close();
+      
+      Path path = new Path(conf.get(GENERATOR_HOSTDB), "current");
+      hostdbReaders = SegmentReaderUtil.getReaders(path, conf);
+      
+      try {
+        Text key = new Text();
+        HostDatum value = new HostDatum();
+        for (int i = 0; i < hostdbReaders.length; i++) {
+          while (hostdbReaders[i].next(key, value)) {
+            hostDatumCache.put(key.toString(), (HostDatum)value.clone());
           }
-        } catch (IOException e) {
-          LOG.error("Error closing HostDB because {}", e.getMessage());
         }
+      } catch (Exception e) {
+        throw new IOException(e);
+      }
+      
+      for (int i = 0; i < hostdbReaders.length; i++) {
+        hostdbReaders[i].close();
       }
     }
 
@@ -402,6 +405,8 @@ public class Generator extends NutchTool implements Tool {
         fetchDelayExpr = JexlUtil
             .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
       }
+      
+      readHostDb();
     }
 
     @Override
@@ -414,7 +419,7 @@ public class Generator extends NutchTool implements Tool {
     public void reduce(FloatWritable key, Iterable<SelectorEntry> values,
         Context context) throws IOException, InterruptedException {
 
-      String hostname = null;
+      String currentHostname = null;
       HostDatum host = null;
       LongWritable variableFetchDelayWritable = null; // in millis
       Text variableFetchDelayKey = new Text("_variableFetchDelay_");
@@ -425,33 +430,31 @@ public class Generator extends NutchTool implements Tool {
         String urlString = url.toString();
         URL u = null;
 
-        // Do this only once per queue
-        if (host == null) {
-          try {
-            hostname = URLUtil.getHost(urlString);
-            host = getHostDatum(hostname);
-          } catch (Exception e) {
-          }
+        String hostname = URLUtil.getHost(urlString);
+        if (!hostname.equals(currentHostname)) {
+          currentHostname = hostname;
+          host = hostDatumCache.get(hostname);
 
           // Got it?
-          if (host == null) {
-            // Didn't work, prevent future lookups
-            host = new HostDatum();
-          } else {
+          if (host != null) {
             if (maxCountExpr != null) {
-              long variableMaxCount = Math
-                  .round((double) maxCountExpr.execute(createContext(host)));
-              LOG.info("Generator: variable maxCount: {} for {}",
-                  variableMaxCount, hostname);
-              maxCount = (int) variableMaxCount;
+              try {
+                long variableMaxCount = Math.round((double)maxCountExpr.execute(createContext(host)));
+                LOG.debug("Generator: variable maxCount: {} for {}", variableMaxCount, hostname);
+                maxCount = (int)variableMaxCount;
+              } catch (Exception e) {
+                LOG.error("Unable to execute variable maxCount expression because: " + e.getMessage(), e);
+              }
             }
 
             if (fetchDelayExpr != null) {
-              long variableFetchDelay = Math
-                  .round((double) fetchDelayExpr.execute(createContext(host)));
-              LOG.info("Generator: variable fetchDelay: {} ms for {}",
-                  variableFetchDelay, hostname);
-              variableFetchDelayWritable = new LongWritable(variableFetchDelay);
+              try {
+                long variableFetchDelay = Math.round((double)fetchDelayExpr.execute(createContext(host)));
+                LOG.debug("Generator: variable fetchDelay: {} ms for {}", variableFetchDelay, hostname);
+                variableFetchDelayWritable = new LongWritable(variableFetchDelay);
+              } catch (Exception e) {
+                LOG.error("Unable to execute fetch delay expression because: " + e.getMessage(), e);
+              }
             }
           }
         }
@@ -551,24 +554,6 @@ public class Generator extends NutchTool implements Tool {
     private String generateFileName(SelectorEntry entry) {
       return "fetchlist-" + entry.segnum.toString() + "/part";
     }
-
-    private HostDatum getHostDatum(String host) throws Exception {
-      Text key = new Text();
-      HostDatum value = new HostDatum();
-
-      open();
-      for (int i = 0; i < hostdbReaders.length; i++) {
-        while (hostdbReaders[i].next(key, value)) {
-          if (host.equals(key.toString())) {
-            close();
-            return value;
-          }
-        }
-      }
-
-      close();
-      return null;
-    }
   }
 
   public static class DecreasingFloatComparator
@@ -1072,7 +1057,7 @@ public class Generator extends NutchTool implements Tool {
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
       System.out.println(
-          "Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
+          "Usage: Generator <crawldb> <segments_dir> [-hostdb <hostdb>] [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
       return -1;
     }
 

Reply via email to