Author: dogacan
Date: Mon Jun 18 11:13:15 2007
New Revision: 548429

URL: http://svn.apache.org/viewvc?view=rev&rev=548429
Log:
NUTCH-489 - URLFilter-suffix management of the url path when the url contains 
some query parameters.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/suffix-urlfilter.txt
    
    lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
    lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Jun 18 11:13:15 2007
@@ -41,6 +41,8 @@
     
 13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of Parse object. (Gal Nitzan via dogacan)
 
+14. NUTCH-489 - URLFilter-suffix management of the url path when the url contains some query parameters. (Emmanuel Joke via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/conf/suffix-urlfilter.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/suffix-urlfilter.txt?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/conf/suffix-urlfilter.txt (original)
+++ lucene/nutch/trunk/conf/suffix-urlfilter.txt Mon Jun 18 11:13:15 2007
@@ -2,6 +2,8 @@
 
 # case-insensitive, allow unknown suffixes
 +I
+# uncomment the line below to filter on url path
+#+P
 
 ### prohibit these
 # pictures

Modified: 
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Mon Jun 18 11:13:15 2007
@@ -22,7 +22,6 @@
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.SuffixStringMatcher;
-import org.apache.nutch.util.TrieStringMatcher;
 
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
@@ -39,6 +38,9 @@
 import java.util.List;
 import java.util.ArrayList;
 
+import java.net.URL;
+import java.net.MalformedURLException;
+
 /**
  * Filters URLs based on a file of URL suffixes. The file is named by
  * <ol>
@@ -127,7 +129,7 @@
 
   private SuffixStringMatcher suffixes;
   private boolean modeAccept = false;
-
+  private boolean filterFromPath = false;
   private boolean ignoreCase = false;
 
   private Configuration conf;
@@ -146,6 +148,15 @@
     if (ignoreCase)
       _url = url.toLowerCase();
     else _url = url;
+    if (filterFromPath) {
+      try {
+        URL pUrl = new URL(_url);
+        _url = pUrl.getPath();
+      } catch (MalformedURLException e) {
+        // don't care
+      }
+    }
+
     String a = suffixes.shortestMatch(_url);
     if (a == null) {
       if (modeAccept) return url;
@@ -185,12 +196,16 @@
           break;
         case '-':
           allow = false;
-          if (line.length() > 1 && line.charAt(1) == 'I')
+          if(line.contains("P"))
+            filterFromPath = true;
+          if(line.contains("I"))
             ignore = true;
           break;
         case '+':
           allow = true;
-          if (line.length() > 1 && line.charAt(1) == 'I')
+          if(line.contains("P"))
+            filterFromPath = true;
+          if(line.contains("I"))
             ignore = true;
           break;
         default:
@@ -284,5 +299,9 @@
 
   public void setIgnoreCase(boolean ignoreCase) {
     this.ignoreCase = ignoreCase;
+  }
+
+  public void setFilterFromPath(boolean filterFromPath) {
+    this.filterFromPath = filterFromPath;
   }
 }

Modified: 
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Mon Jun 18 11:13:15 2007
@@ -35,7 +35,8 @@
     "# this is a comment\n" +
     "\n" +
     ".gif\n" +
-    ".jpg\n";
+    ".jpg\n" +
+    ".js\n";
   
  private static final String[] urls = new String[] {
     "http://www.example.com/test.gif",
@@ -44,6 +45,8 @@
     "http://www.example.com/test.JPG",
     "http://www.example.com/test.html",
     "http://www.example.com/test.HTML",
+    "http://www.example.com/test.html?q=abc.js",
+    "http://www.example.com/test.js?foo=bar&baz=bar#12333",
   };
   
   private static String[] urlsModeAccept = new String[] {
@@ -52,7 +55,9 @@
     null,
     urls[3],
     urls[4],
-    urls[5]
+    urls[5],
+    null,
+    urls[7]
   };
   
   private static String[] urlsModeReject = new String[] {
@@ -61,6 +66,8 @@
     urls[2],
     null,
     null,
+    null,
+    urls[6],
     null
   };
   
@@ -70,18 +77,44 @@
     null,
     null,
     urls[4],
-    urls[5]
+    urls[5],
+    null,
+    urls[7]
   };
-  
+ 
   private static String[] urlsModeRejectIgnoreCase = new String[] {
     urls[0],
     urls[1],
     urls[2],
     urls[3],
     null,
+    null,
+    urls[6],
+    null
+  };
+  
+  private static String[] urlsModeAcceptAndPathFilter = new String[] {
+    null,
+    urls[1],
+    null,
+    urls[3],
+    urls[4],
+    urls[5],
+    urls[6],
     null
   };
   
+  private static String[] urlsModeAcceptAndNonPathFilter = new String[] {
+    null,
+    urls[1],
+    null,
+    urls[3],
+    urls[4],
+    urls[5],
+    null,
+    urls[7]
+  };
+  
   private SuffixURLFilter filter = null;
   
   public TestSuffixURLFilter(String testName) {
@@ -129,6 +162,22 @@
     filter.setModeAccept(false);
     for (int i = 0; i < urls.length; i++) {
       assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
+    }
+  }
+  
+  public void testModeAcceptAndNonPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(false);
+    for (int i = 0; i < urls.length; i++) {
+      assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i]));
+    }
+  }
+  
+  public void testModeAcceptAndPathFilter() {
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(true);
+    for (int i = 0; i < urls.length; i++) {
+      assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i]));
     }
   }
 


Reply via email to