This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 7d3900450 NUTCH-2924 Generate maxCount expr evaluated only once 7d3900450 is described below commit 7d390045049036541d2fd94302ab97c8cb3e3cb1 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Dec 12 16:13:40 2022 +0100 NUTCH-2924 Generate maxCount expr evaluated only once --- src/java/org/apache/nutch/crawl/Generator.java | 103 +++++++++++-------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 0fce6b3b0..8a2f87ba4 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -311,27 +311,30 @@ public class Generator extends NutchTool implements Tool { private SequenceFile.Reader[] hostdbReaders = null; private JexlScript maxCountExpr = null; private JexlScript fetchDelayExpr = null; - - public void open() { - if (conf.get(GENERATOR_HOSTDB) != null) { - try { - Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); - hostdbReaders = SegmentReaderUtil.getReaders(path, conf); - } catch (IOException e) { - LOG.error("Error reading HostDB because {}", e.getMessage()); - } + private Map<String, HostDatum> hostDatumCache = new HashMap<>(); + + public void readHostDb() throws IOException { + if (conf.get(GENERATOR_HOSTDB) == null) { + return; } - } - - public void close() { - if (hostdbReaders != null) { - try { - for (int i = 0; i < hostdbReaders.length; i++) { - hostdbReaders[i].close(); + + Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); + hostdbReaders = SegmentReaderUtil.getReaders(path, conf); + + try { + Text key = new Text(); + HostDatum value = new HostDatum(); + for (int i = 0; i < hostdbReaders.length; i++) { + while (hostdbReaders[i].next(key, value)) { + hostDatumCache.put(key.toString(), (HostDatum)value.clone()); } - } catch (IOException e) { - LOG.error("Error closing HostDB because 
{}", e.getMessage()); } + } catch (Exception e) { + throw new IOException(e); + } + + for (int i = 0; i < hostdbReaders.length; i++) { + hostdbReaders[i].close(); } } @@ -402,6 +405,8 @@ public class Generator extends NutchTool implements Tool { fetchDelayExpr = JexlUtil .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } + + readHostDb(); } @Override @@ -414,7 +419,7 @@ public class Generator extends NutchTool implements Tool { public void reduce(FloatWritable key, Iterable<SelectorEntry> values, Context context) throws IOException, InterruptedException { - String hostname = null; + String currentHostname = null; HostDatum host = null; LongWritable variableFetchDelayWritable = null; // in millis Text variableFetchDelayKey = new Text("_variableFetchDelay_"); @@ -425,33 +430,31 @@ public class Generator extends NutchTool implements Tool { String urlString = url.toString(); URL u = null; - // Do this only once per queue - if (host == null) { - try { - hostname = URLUtil.getHost(urlString); - host = getHostDatum(hostname); - } catch (Exception e) { - } + String hostname = URLUtil.getHost(urlString); + if (!hostname.equals(currentHostname)) { + currentHostname = hostname; + host = hostDatumCache.get(hostname); // Got it? 
- if (host == null) { - // Didn't work, prevent future lookups - host = new HostDatum(); - } else { + if (host != null) { if (maxCountExpr != null) { - long variableMaxCount = Math - .round((double) maxCountExpr.execute(createContext(host))); - LOG.info("Generator: variable maxCount: {} for {}", - variableMaxCount, hostname); - maxCount = (int) variableMaxCount; + try { + long variableMaxCount = Math.round((double)maxCountExpr.execute(createContext(host))); + LOG.debug("Generator: variable maxCount: {} for {}", variableMaxCount, hostname); + maxCount = (int)variableMaxCount; + } catch (Exception e) { + LOG.error("Unable to execute variable maxCount expression because: " + e.getMessage(), e); + } } if (fetchDelayExpr != null) { - long variableFetchDelay = Math - .round((double) fetchDelayExpr.execute(createContext(host))); - LOG.info("Generator: variable fetchDelay: {} ms for {}", - variableFetchDelay, hostname); - variableFetchDelayWritable = new LongWritable(variableFetchDelay); + try { + long variableFetchDelay = Math.round((double)fetchDelayExpr.execute(createContext(host))); + LOG.debug("Generator: variable fetchDelay: {} ms for {}", variableFetchDelay, hostname); + variableFetchDelayWritable = new LongWritable(variableFetchDelay); + } catch (Exception e) { + LOG.error("Unable to execute fetch delay expression because: " + e.getMessage(), e); + } } } } @@ -551,24 +554,6 @@ public class Generator extends NutchTool implements Tool { private String generateFileName(SelectorEntry entry) { return "fetchlist-" + entry.segnum.toString() + "/part"; } - - private HostDatum getHostDatum(String host) throws Exception { - Text key = new Text(); - HostDatum value = new HostDatum(); - - open(); - for (int i = 0; i < hostdbReaders.length; i++) { - while (hostdbReaders[i].next(key, value)) { - if (host.equals(key.toString())) { - close(); - return value; - } - } - } - - close(); - return null; - } } public static class DecreasingFloatComparator @@ -1072,7 +1057,7 @@ public 
class Generator extends NutchTool implements Tool { public int run(String[] args) throws Exception { if (args.length < 2) { System.out.println( - "Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]"); + "Usage: Generator <crawldb> <segments_dir> [-hostdb <hostdb>] [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]"); return -1; }