Author: dogacan
Date: Mon Jun 18 11:13:15 2007
New Revision: 548429
URL: http://svn.apache.org/viewvc?view=rev&rev=548429
Log:
NUTCH-489 - URLFilter-suffix management of the url path when the url contains
some query parameters.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/suffix-urlfilter.txt
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Jun 18 11:13:15 2007
@@ -41,6 +41,8 @@
13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead
of Parse object. (Gal Nitzan via dogacan)
+14. NUTCH-489 - URLFilter-suffix management of the url path when the url
contains some query parameters. (Emmanuel Joke via dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/conf/suffix-urlfilter.txt
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/suffix-urlfilter.txt?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
--- lucene/nutch/trunk/conf/suffix-urlfilter.txt (original)
+++ lucene/nutch/trunk/conf/suffix-urlfilter.txt Mon Jun 18 11:13:15 2007
@@ -2,6 +2,8 @@
# case-insensitive, allow unknown suffixes
+I
+# uncomment the line below to filter on url path
+#+P
### prohibit these
# pictures
Modified:
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
---
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
Mon Jun 18 11:13:15 2007
@@ -22,7 +22,6 @@
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.SuffixStringMatcher;
-import org.apache.nutch.util.TrieStringMatcher;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
@@ -39,6 +38,9 @@
import java.util.List;
import java.util.ArrayList;
+import java.net.URL;
+import java.net.MalformedURLException;
+
/**
* Filters URLs based on a file of URL suffixes. The file is named by
* <ol>
@@ -127,7 +129,7 @@
private SuffixStringMatcher suffixes;
private boolean modeAccept = false;
-
+ private boolean filterFromPath = false;
private boolean ignoreCase = false;
private Configuration conf;
@@ -146,6 +148,15 @@
if (ignoreCase)
_url = url.toLowerCase();
else _url = url;
+ if (filterFromPath) {
+ try {
+ URL pUrl = new URL(_url);
+ _url = pUrl.getPath();
+ } catch (MalformedURLException e) {
+ // don't care
+ }
+ }
+
String a = suffixes.shortestMatch(_url);
if (a == null) {
if (modeAccept) return url;
@@ -185,12 +196,16 @@
break;
case '-':
allow = false;
- if (line.length() > 1 && line.charAt(1) == 'I')
+ if(line.contains("P"))
+ filterFromPath = true;
+ if(line.contains("I"))
ignore = true;
break;
case '+':
allow = true;
- if (line.length() > 1 && line.charAt(1) == 'I')
+ if(line.contains("P"))
+ filterFromPath = true;
+ if(line.contains("I"))
ignore = true;
break;
default:
@@ -284,5 +299,9 @@
public void setIgnoreCase(boolean ignoreCase) {
this.ignoreCase = ignoreCase;
+ }
+
+ public void setFilterFromPath(boolean filterFromPath) {
+ this.filterFromPath = filterFromPath;
}
}
Modified:
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429
==============================================================================
---
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
(original)
+++
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
Mon Jun 18 11:13:15 2007
@@ -35,7 +35,8 @@
"# this is a comment\n" +
"\n" +
".gif\n" +
- ".jpg\n";
+ ".jpg\n" +
+ ".js\n";
private static final String[] urls = new String[] {
"http://www.example.com/test.gif",
@@ -44,6 +45,8 @@
"http://www.example.com/test.JPG",
"http://www.example.com/test.html",
"http://www.example.com/test.HTML",
+ "http://www.example.com/test.html?q=abc.js",
+ "http://www.example.com/test.js?foo=bar&baz=bar#12333",
};
private static String[] urlsModeAccept = new String[] {
@@ -52,7 +55,9 @@
null,
urls[3],
urls[4],
- urls[5]
+ urls[5],
+ null,
+ urls[7]
};
private static String[] urlsModeReject = new String[] {
@@ -61,6 +66,8 @@
urls[2],
null,
null,
+ null,
+ urls[6],
null
};
@@ -70,18 +77,44 @@
null,
null,
urls[4],
- urls[5]
+ urls[5],
+ null,
+ urls[7]
};
-
+
private static String[] urlsModeRejectIgnoreCase = new String[] {
urls[0],
urls[1],
urls[2],
urls[3],
null,
+ null,
+ urls[6],
+ null
+ };
+
+ private static String[] urlsModeAcceptAndPathFilter = new String[] {
+ null,
+ urls[1],
+ null,
+ urls[3],
+ urls[4],
+ urls[5],
+ urls[6],
null
};
+ private static String[] urlsModeAcceptAndNonPathFilter = new String[] {
+ null,
+ urls[1],
+ null,
+ urls[3],
+ urls[4],
+ urls[5],
+ null,
+ urls[7]
+ };
+
private SuffixURLFilter filter = null;
public TestSuffixURLFilter(String testName) {
@@ -129,6 +162,22 @@
filter.setModeAccept(false);
for (int i = 0; i < urls.length; i++) {
assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
+ }
+ }
+
+ public void testModeAcceptAndNonPathFilter() {
+ filter.setModeAccept(true);
+ filter.setFilterFromPath(false);
+ for (int i = 0; i < urls.length; i++) {
+ assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i]));
+ }
+ }
+
+ public void testModeAcceptAndPathFilter() {
+ filter.setModeAccept(true);
+ filter.setFilterFromPath(true);
+ for (int i = 0; i < urls.length; i++) {
+ assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i]));
}
}