Author: jnioche
Date: Tue Jan  4 19:26:16 2011
New Revision: 1055145

URL: http://svn.apache.org/viewvc?rev=1055145&view=rev
Log:
applied NUTCH-905  to 1.3

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/conf/nutch-default.xml
    
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Tue Jan  4 19:26:16 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-905 Configurable file protocol parent directory crawling (Thorsten 
Scherler, mattmann, ab)
+
 * NUTCH-787 ScoringFilters should not override the injected score (jnioche)
 
 * NUTCH-949 Conflicting ANT jars in classpath (jnioche)

Modified: nutch/branches/branch-1.3/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/nutch-default.xml?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.3/conf/nutch-default.xml Tue Jan  4 19:26:16 2011
@@ -33,6 +33,15 @@
   confuse this setting with the http.content.limit setting.
   </description>
 </property>
+  
+<property>
+  <name>file.crawl.parent</name>
+  <value>true</value>
+  <description>The crawler is not restricted to the directories that you 
specified in the
+    Urls file but it is jumping into the parent directories as well. For your 
own crawlings you can
+    change this behavior (set to false) the way that only directories beneath 
the directories that you specify get
+    crawled.</description>
+</property>
 
 <property>
   <name>file.content.ignored</name>

Modified: 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Tue Jan  4 19:26:16 2011
@@ -53,6 +53,7 @@ public class File implements Protocol {
   static final int MAX_REDIRECTS = 5;
 
   int maxContentLength;
+  boolean crawlParents;
 
   private Configuration conf;
 
@@ -156,6 +157,7 @@ public class File implements Protocol {
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
   }
 
   public Configuration getConf() {

Modified: 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1055145&r1=1055144&r2=1055145&view=diff
==============================================================================
--- 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Tue Jan  4 19:26:16 2011
@@ -222,7 +222,10 @@ public class FileResponse {
     throws IOException {
 
     String path = f.toString();
-    this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : 
true);
+    if (this.file.crawlParents)
+        this.content = list2html(f.listFiles(), path, "/".equals(path) ? false 
: true);
+    else
+        this.content = list2html(f.listFiles(), path, false);
 
     // set headers
     headers.set(Response.CONTENT_LENGTH,


Reply via email to