[Nutch-dev] patch ready. Re: client for scheme file:// ?

john Sat, 05 Jun 2004 10:39:04 -0700

Hi, Doug and all,

Attached is a patch for the file:// scheme. It will be useful for
(1) local/personal search
(2) development/testing
(3) a starting point of "desktop search"?
It was developed on Linux. If it breaks on MS Windows, please let me know
or provide a patch.


It can be downloaded at http://nutch.neasys.com/patch/, together with
a REQUIRED jar file jaf-1.0.2.jar. Check under 20040605. 
The jar file needs to be in ./nutch/lib.

Next, I am going to work on code for the ftp:// scheme and on
parsers for PDF and MSDOC. Let me know if you've already worked on them.

John

--------------------------------- patch ----------------------------------------

diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/conf/nutch-default.xml 
nutch-cvs-20040605.xing/conf/nutch-default.xml
--- nutch-cvs-20040605/conf/nutch-default.xml   2004-06-04 13:24:51.000000000 -0700
+++ nutch-cvs-20040605.xing/conf/nutch-default.xml      2004-06-04 23:32:28.000000000 
-0700
@@ -83,6 +83,27 @@
   <description>If true, HTTP will log more verbosely.</description>
 </property>
 
+<!-- FILE properties -->
+
+<property>
+  <name>file.content.limit</name>
+  <value>65536</value>
+  <description>The length limit for downloaded content, in bytes.
+  If this value is larger than zero, content longer than it will be
+  truncated; otherwise (zero or negative), no truncation at all.
+  </description>
+</property>
+
+<property>
+  <name>file.content.ignored</name>
+  <value>true</value>
+  <description>If true, no file content will be saved during fetch.
+  This is probably what we want most of the time, since file:// URLs
+  are meant to be local and we can always use them directly at the parsing
+  and indexing stages. Otherwise file contents will be saved.
+  !! NOT IMPLEMENTED YET !!
+  </description>
+</property>
 
 <!-- FTP properties -->
 
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/build.xml nutch-cvs-20040605.xing/src/plugin/build.xml
--- nutch-cvs-20040605/src/plugin/build.xml     2004-06-04 13:24:53.000000000 -0700
+++ nutch-cvs-20040605.xing/src/plugin/build.xml        2004-06-05 00:23:08.000000000 
-0700
@@ -6,6 +6,7 @@
   <!-- Build & deploy all the plugin jars.                    -->
   <!-- ====================================================== -->
   <target name="deploy">
+    <ant dir="protocol-file" target="deploy"/>
     <ant dir="protocol-http" target="deploy"/>
     <ant dir="parse-html" target="deploy"/>
     <ant dir="parse-text" target="deploy"/>
@@ -24,6 +25,7 @@
   <!-- Clean all of the plugins.                              -->
   <!-- ====================================================== -->
   <target name="clean">
+    <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
     <ant dir="parse-html" target="clean"/>
     <ant dir="parse-text" target="clean"/>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/build.xml 
nutch-cvs-20040605.xing/src/plugin/protocol-file/build.xml
--- nutch-cvs-20040605/src/plugin/protocol-file/build.xml       1969-12-31 
16:00:00.000000000 -0800
+++ nutch-cvs-20040605.xing/src/plugin/protocol-file/build.xml  2004-06-03 
22:19:48.000000000 -0700
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="protocol-file" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/plugin.xml 
nutch-cvs-20040605.xing/src/plugin/protocol-file/plugin.xml
--- nutch-cvs-20040605/src/plugin/protocol-file/plugin.xml      1969-12-31 
16:00:00.000000000 -0800
+++ nutch-cvs-20040605.xing/src/plugin/protocol-file/plugin.xml 2004-06-05 
09:24:09.000000000 -0700
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="protocol-file"
+   name="File Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <extension-point
+      id="net.nutch.protocol.Protocol"
+      name="Nutch Protocol"/>
+
+   <runtime>
+      <library name="protocol-file.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="net.nutch.protocol.file"
+              name="FileProtocol"
+              point="net.nutch.protocol.Protocol">
+
+      <implementation id="net.nutch.protocol.file.File"
+                      class="net.nutch.protocol.file.File"
+                      protocolName="file"/>
+
+   </extension>
+
+</plugin>
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java
 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java
--- 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java
 1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java
    2004-06-04 23:34:15.000000000 -0700
@@ -0,0 +1,19 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.protocol.file;
+
+/** Thrown for File error codes.
+ */
+public class FileError extends FileException {
+
+  /** HTTP-style status code this error was created with. */
+  private final int code;
+
+  /**
+   * Returns the status code this error was created with.
+   *
+   * @return the code passed to the constructor
+   */
+  public int getCode() { return code; }
+
+  /**
+   * Kept only so that existing callers keep compiling.  The argument is
+   * ignored; the stored status code is returned.
+   *
+   * @param code ignored
+   * @return the code passed to the constructor
+   * @deprecated use {@link #getCode()} instead.
+   */
+  public int getCode(int code) { return getCode(); }
+
+  /**
+   * Creates an error whose message is "File Error: " followed by the code.
+   *
+   * @param code HTTP-style status code describing the failure
+   */
+  public FileError(int code) {
+    super("File Error: " + code);
+    this.code = code;
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java
 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java
--- 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java
     1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java
        2004-06-04 09:46:04.000000000 -0700
@@ -0,0 +1,26 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.protocol.file;
+
+import net.nutch.protocol.ProtocolException;
+
+/**
+ * Exception type used by the file: protocol classes in this package.
+ * Every constructor forwards its arguments unchanged to ProtocolException.
+ */
+public class FileException extends ProtocolException {
+
+  /** Creates an exception that carries only a cause. */
+  public FileException(Throwable cause) {
+    super(cause);
+  }
+
+  /** Creates an exception that carries both a message and a cause. */
+  public FileException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /** Creates an exception that carries only a message. */
+  public FileException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception with neither a message nor a cause. */
+  public FileException() {
+    super();
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java
--- 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java 
     1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java
 2004-06-04 16:25:25.000000000 -0700
@@ -0,0 +1,196 @@
+/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.protocol.file;
+
+import javax.activation.MimetypesFileTypeMap;
+// 20040528, xing, disabled for now
+//import xing.net.nutch.util.magicfile.*;
+
+import net.nutch.net.protocols.HttpDateFormat;
+
+import net.nutch.util.LogFormatter;
+import net.nutch.util.NutchConf;
+
+import net.nutch.protocol.Content;
+import net.nutch.protocol.Protocol;
+
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import java.net.URL;
+
+import java.io.InputStream;
+// 20040528, xing, disabled for now
+//import java.io.Reader;
+import java.io.IOException;
+
+/************************************
+ * File.java deals with file: scheme.
+ *
+ * Configurable parameters are defined under "FILE properties" section
+ * in ./conf/nutch-default.xml or similar.
+ *
+ * @author John Xing
+ ***********************************/
+public class File implements Protocol {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.protocol.file.File");
+
+  /** Number of redirects getContent() follows before it gives up. */
+  static final int MAX_REDIRECTS = 5;
+
+  /** Usage line printed by main() when the arguments are unusable. */
+  private static final String USAGE =
+    "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
+
+  /**
+   * Truncation limit in bytes, read from "file.content.limit" with 64*1024
+   * passed as the default.  The field is static, so one value is shared by
+   * every File instance.
+   */
+  static int maxContentLength = NutchConf.getInt("file.content.limit",64*1024);
+
+  // 20040412, xing
+  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
+  // are placed in each thread before we check out if they're thread-safe.
+
+  // http date format
+  HttpDateFormat httpDateFormat = null;
+
+  // file name extension to mime-type map; stays null when the
+  // mime.types.file resource cannot be opened
+  static MimetypesFileTypeMap TYPE_MAP = null;
+
+  static {
+    try {
+      // read mime types from config file
+      InputStream is =
+        NutchConf.getConfResourceAsInputStream
+        (NutchConf.get("mime.types.file"));
+      if (is == null) {
+        LOG.warning
+          ("no mime.types.file: won't use url extension for content-type.");
+      } else {
+        try {
+          TYPE_MAP = new MimetypesFileTypeMap(is);
+        } finally {
+          // release the stream even if the map could not be built
+          is.close();
+        }
+      }
+    } catch (IOException e) {
+      LOG.log(Level.SEVERE, "Unexpected error", e);
+    }
+  }
+
+// 20040528, xing, disabled for now
+//  // file magic for determining content type
+//  static MagicFile MAGIC = null;
+//
+//  static {
+//    try {
+//      // read file magic from config file
+//      Reader reader =
+//        NutchConf.getConfResourceAsReader
+//          (NutchConf.get("mime.magic.file"));
+//      if (reader == null) {
+//        LOG.warning
+//          ("no mime.magic.file: won't use file magic for content-type.");
+//        MAGIC = null;
+//      } else {
+//        MAGIC = MagicFile.getInstance(reader);
+//      }
+//
+//      if (reader != null)
+//        reader.close();
+//    } catch (IOException e) {
+//      LOG.log(Level.SEVERE, "Unexpected error", e);
+//    }
+//  }
+
+  /** Creates a protocol handler with its own HttpDateFormat instance. */
+  public File() {
+    this.httpDateFormat = new HttpDateFormat();
+  }
+
+  /**
+   * Set the point at which content is truncated.
+   * The limit lives in a static field, so the new value applies to every
+   * File instance, not only to the one this method is called on.
+   *
+   * @param length new limit in bytes
+   */
+  public void setMaxContentLength(int length) { maxContentLength = length; }
+
+  /**
+   * Fetches the content behind a file: url, following redirects.
+   * A FileResponse is built for the url; code 200 yields its Content,
+   * a 3xx code makes the loop retry with the url found in the "Location"
+   * header, and any other code is converted into a FileError.
+   *
+   * @param urlString the url to fetch; also recorded as the original url
+   *                  of every FileResponse created while following redirects
+   * @return the content of the first response with code 200
+   * @throws FileException if more than MAX_REDIRECTS redirects are needed,
+   *         if an IOException occurs (it is wrapped), or, as a FileError,
+   *         if the response code is neither 200 nor 3xx
+   */
+  public Content getContent(String urlString) throws FileException {
+    try {
+      URL url = new URL(urlString);
+
+      int redirects = 0;
+
+      while (true) {
+        FileResponse response =
+          new FileResponse(urlString, url, this);   // make a request
+
+        int code = response.getCode();
+
+        if (code == 200) {                          // got a good response
+          return response.toContent();              // return it
+
+        } else if (code >= 300 && code < 400) {     // handle redirect
+          if (redirects == MAX_REDIRECTS)
+            throw new FileException("Too many redirects: " + url);
+          url = new URL(response.getHeader("Location"));
+          redirects++;
+          if (LOG.isLoggable(Level.FINE))
+            LOG.fine("redirect to " + url);
+
+        } else {                                    // convert to exception
+          throw new FileError(code);
+        }
+      }
+    } catch (IOException e) {
+      throw new FileException(e);
+    }
+  }
+
+//  protected void finalize () {
+//    // nothing here
+//  }
+
+  /** Prints the usage line to stderr and terminates the JVM with status -1. */
+  private static void exitWithUsage() {
+    System.err.println(USAGE);
+    System.exit(-1);
+  }
+
+  /**
+   * For debugging.  Fetches one url and prints its Content-Type,
+   * Content-Length and Last-Modified values to stderr; with -dumpContent
+   * the content itself is printed to stdout.
+   *
+   * @param args [-logLevel level] [-maxContentLength L] [-dumpContent] url
+   */
+  public static void main(String[] args) throws Exception {
+    int limit = Integer.MIN_VALUE;   // MIN_VALUE means "not given"
+    String logLevel = "info";
+    boolean dumpContent = false;
+    String urlString = null;
+
+    if (args.length == 0)
+      exitWithUsage();
+
+    for (int i = 0; i < args.length; i++) {
+      String arg = args[i];
+      boolean hasNext = i < args.length-1;
+
+      if (arg.equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (arg.equals("-logLevel")
+                 || arg.equals("-maxContentLength")) {
+        // both options need a value; refuse to read past the end of args
+        if (!hasNext)
+          exitWithUsage();
+        String value = args[++i];
+        if (arg.equals("-logLevel"))
+          logLevel = value;
+        else
+          limit = Integer.parseInt(value);
+      } else if (hasNext) {
+        // only the last argument may be the url
+        exitWithUsage();
+      } else {
+        urlString = arg;
+      }
+    }
+
+    // the last argument was consumed as an option, so no url was given
+    if (urlString == null)
+      exitWithUsage();
+
+    File file = new File();
+
+    if (limit != Integer.MIN_VALUE) // set maxContentLength
+      file.setMaxContentLength(limit);
+
+    // set log level; a fixed locale keeps the upper-casing of level names
+    // independent of the platform's default locale
+    LOG.setLevel(Level.parse(logLevel.toUpperCase(java.util.Locale.ENGLISH)));
+
+    Content content = file.getContent(urlString);
+
+    System.err.println("Content-Type: "
+      + content.getContentType());
+    System.err.println("Content-Length: "
+      + content.getMetaData().get("Content-Length"));
+    System.err.println("Last-Modified: "
+      + content.getMetaData().get("Last-Modified"));
+    if (dumpContent) {
+      System.out.print(new String(content.getContent()));
+    }
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java
 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java
--- 
nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java
      1969-12-31 16:00:00.000000000 -0800
+++ 
nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java
 2004-06-04 16:23:19.000000000 -0700
@@ -0,0 +1,261 @@
+/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.protocol.file;
+
+import javax.activation.MimetypesFileTypeMap;
+// 20040528, xing, disabled for now
+//import xing.net.nutch.util.magicfile.*;
+
+import net.nutch.protocol.Content;
+
+import java.net.URL;
+
+import java.util.TreeMap;
+import java.util.Properties;
+
+import java.util.logging.Level;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+/************************************
+ * FileResponse.java mimics file replies as http response.
+ * It tries its best to follow http's way for headers, response codes
+ * as well as exceptions.
+ *
+ * Comments:
+ * (1) java.net.URL and java.net.URLConnection can handle file: scheme.
+ * However they are not flexible enough, so not used in this implementation.
+ *
+ * (2) java.io.File is used for its abstractness across platforms.
+ * Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files,
+ * such as /dev/* in unix and /proc/* on linux, are treated. Tests show
+ *  (a) java.io.File.isFile() returns false for /dev/*
+ *  (b) java.io.File.isFile() returns true for /proc/*
+ *  (c) java.io.File.length() returns 0 for /proc/*
+ * We are probably okay for now. Could be buggy here.
+ * How about special files on windows?
+ *
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files.
+ * They are just treated as individual files.
+ *
+ * (4) No fancy POSIX file attributes yet. May never need?
+ *
+ * @author John Xing
+ ***********************************/
+public class FileResponse {
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Properties headers = new Properties();
+
+  private final File file;
+
+  /** Returns the response code. */
+  public int getCode() { return code; }
+
+  /** Returns the value of a named header. */
+  public String getHeader(String name) {
+    return (String)headers.get(name);
+  }
+
+  public byte[] getContent() { return content; }
+
+  public Content toContent() {
+    return new Content(orig, base, content,
+                       getHeader("Content-Type"),
+                       headers);
+  }
+
+  public FileResponse(URL url, File file)
+    throws FileException, IOException {
+    this(url.toString(), url, file);
+  }
+
+  public FileResponse(String orig, URL url, File file)
+    throws FileException, IOException {
+
+    this.orig = orig;
+    this.base = url.toString();
+    this.file = file;
+
+    if (!"file".equals(url.getProtocol()))
+      throw new FileException("Not a file url:" + url);
+
+    if (File.LOG.isLoggable(Level.FINE))
+      File.LOG.fine("fetching " + url);
+
+    if (url.getPath() != url.getFile())
+      File.LOG.warning("url.getPath() != url.getFile(): " + url);
+
+    String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+    try {
+
+      this.content = null;
+
+      // url.toURI() is only in j2se 1.5.0
+      //java.io.File f = new java.io.File(url.toURI());
+      java.io.File f = new java.io.File(path);
+
+      if (!f.exists()) {
+        this.code = 404;  // http Not Found
+        return;
+      }
+
+      if (!f.canRead()) {
+        this.code = 401;  // http Unauthorized
+        return;
+      }
+
+      // symbolic link or relative path on unix
+      // fix me: what's the consequence on windows platform
+      // where case is insensitive
+      if (!f.equals(f.getCanonicalFile())) {
+        // set headers
+        TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER);
+        //hdrs.put("Location", f.getCanonicalFile().toURI());
+        hdrs.put("Location", f.getCanonicalFile().toURL().toString());
+        this.headers.putAll(hdrs);
+
+        this.code = 300;  // http redirect
+        return;
+      }
+
+      if (f.isDirectory()) {
+        getDirAsHttpResponse(f);
+      } else if (f.isFile()) {
+        getFileAsHttpResponse(f);
+      } else {
+        this.code = 500; // http Internal Server Error
+        return;
+      }
+
+    } catch (IOException e) {
+      throw e;
+    }
+
+  }
+
+  // get file as http response
+  private void getFileAsHttpResponse(java.io.File f)
+    throws FileException, IOException {
+
+    // ignore file of size larger than
+    // Integer.MAX_VALUE = 2^31-1 = 2147483647
+    long size = f.length();
+    if (size > Integer.MAX_VALUE) {
+      throw new FileException("file is too large, size: "+size);
+      // or we can do this?
+      // this.code = 400;  // http Bad request
+      // return;
+    }
+
+    // capture content
+    int len = (int) size;
+    
+    if (this.file.maxContentLength > 0 && len > this.file.maxContentLength)
+      len = this.file.maxContentLength;
+
+    this.content = new byte[len];
+
+    java.io.InputStream is = new java.io.FileInputStream(f);
+    int offset = 0; int n = 0;
+    while (offset < len
+      && (n = is.read(this.content, offset, len-offset)) >= 0) {
+      offset += n;
+    }
+    if (offset < len) // keep whatever already have, but issue a warning
+      File.LOG.warning("not enough bytes read from file: "+f.getPath());
+    is.close(); 
+
+    // set headers
+    TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER);
+
+    hdrs.put("Content-Length", new Long(size).toString());
+
+    hdrs.put("Last-Modified",
+      this.file.httpDateFormat.toString(f.lastModified()));
+
+    String contentType = null;
+// 20040528, xing, disabled for now
+//      if (contentType == null && this.file.MAGIC != null)
+//        contentType = this.file.MAGIC.getMimeType(this.content);
+    if (contentType == null && this.file.TYPE_MAP != null)
+      contentType = this.file.TYPE_MAP.getContentType(f.getName());
+    if (contentType != null)
+      hdrs.put("Content-Type", contentType);
+
+    this.headers.putAll(hdrs);
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  // get dir list as http response
+  private void getDirAsHttpResponse(java.io.File f)
+    throws IOException {
+
+    String path = f.toString();
+    this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+
+    // set headers
+    TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER);
+
+    hdrs.put("Content-Length",
+      new Integer(this.content.length).toString());
+
+    hdrs.put("Content-Type", "text/html");
+
+    hdrs.put("Last-Modified",
+      this.file.httpDateFormat.toString(f.lastModified()));
+
+    this.headers.putAll(hdrs);
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  // generate html page from dir list
+  private byte[] list2html(java.io.File[] list,
+    String path, boolean includeDotDot) {
+
+    StringBuffer x = new StringBuffer("<html><head>");
+    x.append("<title>Index of "+path+"</title></head>\n");
+    x.append("<body><h1>Index of "+path+"</h1><pre>\n");
+
+    if (includeDotDot) {
+      x.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    // fix me: we might want to sort list here! but not now.
+
+    java.io.File f;
+    for (int i=0; i<list.length; i++) {
+      f = list[i];
+      String name = f.getName();
+      String time = this.file.httpDateFormat.toString(f.lastModified());
+      if (f.isDirectory()) {
+        // java 1.4.2 api says dir itself and parent dir are not listed
+        // so the following is not needed.
+        //if (name.equals(".") || name.equals(".."))
+        //  continue;
+        x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
+        x.append(time+"\t-\n");
+      } else if (f.isFile()) {
+        x.append("<a href='"+name+    "'>"+name+"</a>\t");
+        x.append(time+"\t"+f.length()+"\n");
+      } else {
+        // ignore any other
+      }
+    }
+
+    x.append("</pre></body></html>\n");
+
+    return new String(x).getBytes();
+  }
+
+}


-------------------------------------------------------
This SF.Net email is sponsored by the new InstallShield X.
>From Windows to Linux, servers to mobile, InstallShield X is the one
installation-authoring solution that does it all. Learn more and
evaluate today! http://www.installshield.com/Dev2Dev/0504
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

[Nutch-dev] patch ready. Re: client for scheme file:// ?

Reply via email to