This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new f87b19b NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton - do not extract host and domain name from the URL if not needed - speed up regular expressions: - use non-capturing groups if possible - use (?i) to make the patterns case insensitive and remove uppercase variants to keep alternations shorter new da8f3f5 Merge pull request #432 from sebastian-nagel/NUTCH-2689-urlfilter-regex-speed-up f87b19b is described below commit f87b19b0ee8a01c5f54f5ed4b6b159169705682f Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Tue Jan 22 14:45:29 2019 +0100 NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton - do not extract host and domain name from the URL if not needed - speed up regular expressions: - use non-capturing groups if possible - use (?i) to make the patterns case insensitive and remove uppercase variants to keep alternations shorter --- conf/regex-urlfilter.txt.template | 4 +-- .../nutch/urlfilter/api/RegexURLFilterBase.java | 40 +++++++++++++--------- src/plugin/urlfilter-regex/sample/Benchmarks.rules | 12 +++---- .../urlfilter-regex/sample/IntranetCrawling.rules | 6 ++-- .../urlfilter-regex/sample/WholeWebCrawling.rules | 4 +-- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template index 4319bf1..1448642 100644 --- a/conf/regex-urlfilter.txt.template +++ b/conf/regex-urlfilter.txt.template @@ -24,14 +24,14 @@ # matches, the URL is ignored. 
# skip file: ftp: and mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip URLs longer than 2048 characters, see also db.max.outlink.length #-^.{2049,} # skip image and other suffixes we can't yet parse # for a more extensive coverage use the urlfilter-suffix plugin --(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$ +-(?i)\.(?:gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index ecbe29d..993b37d 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -69,6 +69,14 @@ public abstract class RegexURLFilterBase implements URLFilter { private Configuration conf; /** + * Whether there are host- or domain-specific rules. If there are no specific + * rules host and domain name are not extracted from the URL to speed up the + * matching. {@link #readRules(Reader)} automatically sets this to true if + * host- or domain-specific rules are used in the rule file. + */ + protected boolean hasHostDomainRules = false; + + /** * Constructs a new empty RegexURLFilterBase */ public RegexURLFilterBase() { @@ -154,34 +162,33 @@ public abstract class RegexURLFilterBase implements URLFilter { // Inherited Javadoc public String filter(String url) { - String host = URLUtil.getHost(url); + String host = null; String domain = null; - - try { - domain = URLUtil.getDomainName(url); - } catch (MalformedURLException e) { - // shouldnt happen here right? 
+ + if (hasHostDomainRules) { + host = URLUtil.getHost(url); + try { + domain = URLUtil.getDomainName(url); + } catch (MalformedURLException e) { + // shouldnt happen here right? + } + + LOG.debug("URL belongs to host {} and domain {}", host, domain); } - if (LOG.isDebugEnabled()) { - LOG.debug("URL belongs to host " + host + " and domain " + domain); - } - for (RegexRule rule : rules) { // Skip the skip for rules that don't share the same host and domain if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host) && !rule.hostOrDomain().equals(domain)) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); - } + LOG.debug("Skipping rule [{}] for host: {}", rule.regex(), + rule.hostOrDomain()); continue; } - if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); - } + LOG.debug("Applying rule [{}] for host {} and domain {}", rule.regex(), + host, domain); if (rule.match(url)) { return rule.accept() ? url : null; @@ -265,6 +272,7 @@ public abstract class RegexURLFilterBase implements URLFilter { continue; case '>': hostOrDomain = line.substring(1).trim(); + hasHostDomainRules = true; continue; case '<': hostOrDomain = null; diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.rules b/src/plugin/urlfilter-regex/sample/Benchmarks.rules index c8901e2..6a85118 100644 --- a/src/plugin/urlfilter-regex/sample/Benchmarks.rules +++ b/src/plugin/urlfilter-regex/sample/Benchmarks.rules @@ -9,18 +9,18 @@ # matches, the URL is ignored. # skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$ # skip URLs containing certain characters as probable queries, etc. 
-[?*!@=] # skip .fr .org and .net domains --^.*//.*\.fr/ --^.*//.*\.org/ --^.*//.*\.net/ +-^[^/]*//[^/]*\.fr/ +-^[^/]*//[^/]*\.org/ +-^[^/]*//[^/]*\.net/ -# skip everything else +# accept everything else +. diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules index 705bdb2..e651dd5 100644 --- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules @@ -9,10 +9,10 @@ # matches, the URL is ignored. # skip file:, ftp:, & mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] @@ -21,7 +21,7 @@ -.*(/.+?)/.*?\1/.*?\1/ # accept hosts in MY.DOMAIN.NAME -+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ ++^https?://(?:[a-z0-9]*\.)*MY.DOMAIN.NAME/ # skip everything else -. diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules index 8778921..ac9ad60 100644 --- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules @@ -7,10 +7,10 @@ # matches, the URL is ignored. # skip file: ftp: and mailto: urls --^(file|ftp|mailto): +-^(?:file|ftp|mailto): # skip image and other suffixes we can't yet parse --\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$ +-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe)$ # skip URLs containing certain characters as probable queries, etc. -[?*!@=]