Author: jnioche
Date: Tue Jan 4 19:26:16 2011
New Revision: 1055145
URL: http://svn.apache.org/viewvc?rev=1055145&view=rev
Log:
applied NUTCH-905 to 1.3
Modified:
nutch/branches/branch-1.3/CHANGES.txt
nutch/branches/branch-1.3/conf/nutch-default.xml
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Modified: nutch/branches/branch-1.3/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Tue Jan 4 19:26:16 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - Current Development
+* NUTCH-905 Configurable file protocol parent directory crawling (Thorsten
Scherler, mattmann, ab)
+
* NUTCH-787 ScoringFilters should not override the injected score (jnioche)
* NUTCH-949 Conflicting ANT jars in classpath (jnioche)
Modified: nutch/branches/branch-1.3/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/nutch-default.xml?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.3/conf/nutch-default.xml Tue Jan 4 19:26:16 2011
@@ -33,6 +33,15 @@
confuse this setting with the http.content.limit setting.
</description>
</property>
+
+<property>
+ <name>file.crawl.parent</name>
+ <value>true</value>
+ <description>The crawler is not restricted to the directories that you
specified in the
+ Urls file; it also crawls their parent directories. For your
own crawls you can
+ change this behavior (set to false) so that only directories beneath
the directories that you specify get
+ crawled.</description>
+</property>
<property>
<name>file.content.ignored</name>
Modified:
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
Tue Jan 4 19:26:16 2011
@@ -53,6 +53,7 @@ public class File implements Protocol {
static final int MAX_REDIRECTS = 5;
int maxContentLength;
+ boolean crawlParents;
private Configuration conf;
@@ -156,6 +157,7 @@ public class File implements Protocol {
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+ this.crawlParents = conf.getBoolean("file.crawl.parent", true);
}
public Configuration getConf() {
Modified:
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
---
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
(original)
+++
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Tue Jan 4 19:26:16 2011
@@ -222,7 +222,10 @@ public class FileResponse {
throws IOException {
String path = f.toString();
- this.content = list2html(f.listFiles(), path, "/".equals(path) ? false :
true);
+ if (this.file.crawlParents)
+ this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
: true);
+ else
+ this.content = list2html(f.listFiles(), path, false);
// set headers
headers.set(Response.CONTENT_LENGTH,