This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new f87b19b NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton - do not extract host and domain name from the URL if not needed - speed up regular expressions: - use non-capturing groups if possible - use (?i) to make the patterns case insensitive and remove uppercase variants to keep alternations shorter new da8f3f5 Merge pull request #432 from sebastian-nagel/NUTCH-2689-urlfilter-regex-speed-up f87b19b is described below commit f87b19b0ee8a01c5f54f5ed4b6b159169705682f Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Tue Jan 22 14:45:29 2019 +0100 NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton - do not extract host and domain name from the URL if not needed - speed up regular expressions: - use non-capturing groups if possible - use (?i) to make the patterns case insensitive and remove uppercase variants to keep alternations shorter --- conf/regex-urlfilter.txt.template | 4 +-- .../nutch/urlfilter/api/RegexURLFilterBase.java | 40 +++++++++++++--------- src/plugin/urlfilter-regex/sample/Benchmarks.rules | 12 +++---- .../urlfilter-regex/sample/IntranetCrawling.rules | 6 ++-- .../urlfilter-regex/sample/WholeWebCrawling.rules | 4 +-- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template index 4319bf1..1448642 100644 --- a/conf/regex-urlfilter.txt.template +++ b/conf/regex-urlfilter.txt.template @@ -24,14 +24,14 @@ # matches, the URL is ignored. 
# skip file: ftp: and mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip URLs longer than 2048 characters, see also db.max.outlink.length #-^.{2049,} # skip image and other suffixes we can't yet parse # for a more extensive coverage use the urlfilter-suffix plugin --(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$ +-(?i)\.(?:gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index ecbe29d..993b37d 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -69,6 +69,14 @@ public abstract class RegexURLFilterBase implements URLFilter { private Configuration conf; /** + * Whether there are host- or domain-specific rules. If there are no specific + * rules host and domain name are not extracted from the URL to speed up the + * matching. {@link #readRules(Reader)} automatically sets this to true if + * host- or domain-specific rules are used in the rule file. + */ + protected boolean hasHostDomainRules = false; + + /** * Constructs a new empty RegexURLFilterBase */ public RegexURLFilterBase() { @@ -154,34 +162,33 @@ public abstract class RegexURLFilterBase implements URLFilter { // Inherited Javadoc public String filter(String url) { - String host = URLUtil.getHost(url); + String host = null; String domain = null; - - try { - domain = URLUtil.getDomainName(url); - } catch (MalformedURLException e) { - // shouldnt happen here right? 
+ + if (hasHostDomainRules) { + host = URLUtil.getHost(url); + try { + domain = URLUtil.getDomainName(url); + } catch (MalformedURLException e) { + // shouldnt happen here right? + } + + LOG.debug("URL belongs to host {} and domain {}", host, domain); } - if (LOG.isDebugEnabled()) { - LOG.debug("URL belongs to host " + host + " and domain " + domain); - } - for (RegexRule rule : rules) { // Skip the skip for rules that don't share the same host and domain if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host) && !rule.hostOrDomain().equals(domain)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); - } + LOG.debug("Skipping rule [{}] for host: {}", rule.regex(), + rule.hostOrDomain()); continue; } - if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); - } + LOG.debug("Applying rule [{}] for host {} and domain {}", rule.regex(), + host, domain); if (rule.match(url)) { return rule.accept() ? url : null; @@ -265,6 +272,7 @@ public abstract class RegexURLFilterBase implements URLFilter { continue; case '>': hostOrDomain = line.substring(1).trim(); + hasHostDomainRules = true; continue; case '<': hostOrDomain = null; diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.rules b/src/plugin/urlfilter-regex/sample/Benchmarks.rules index c8901e2..6a85118 100644 --- a/src/plugin/urlfilter-regex/sample/Benchmarks.rules +++ b/src/plugin/urlfilter-regex/sample/Benchmarks.rules @@ -9,18 +9,18 @@ # matches, the URL is ignored. # skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$ # skip URLs containing certain characters as probable queries, etc. 
-[?*!@=] # skip .fr .org and .net domains --^.*//.*\.fr/ --^.*//.*\.org/ --^.*//.*\.net/ +-^[^/]*//[^/]*\.fr/ +-^[^/]*//[^/]*\.org/ +-^[^/]*//[^/]*\.net/ -# skip everything else +# accept everything else +. diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules index 705bdb2..e651dd5 100644 --- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules @@ -9,10 +9,10 @@ # matches, the URL is ignored. # skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] @@ -21,7 +21,7 @@ -.*(/.+?)/.*?\1/.*?\1/ # accept hosts in MY.DOMAIN.NAME -+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ ++^https?://(?:[a-z0-9]*\.)*MY.DOMAIN.NAME/ # skip everything else -. diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules index 8778921..ac9ad60 100644 --- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules @@ -7,10 +7,10 @@ # matches, the URL is ignored. # skip file: ftp: and mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=]