Hi, Doug and all, Attached is a patch that adds support for the file:// scheme. It should be useful for (1) local/personal search, (2) development/testing, and (3) possibly as a starting point for "desktop search". It was done on Linux. If it breaks on MS Windows, please let me know or provide a patch.
It can be downloaded at http://nutch.neasys.com/patch/, together with a REQUIRED jar file jaf-1.0.2.jar. Check under 20040605. The jar file needs to be in ./nutch/lib. Next, I am going to work on codes for ftp:// scheme and parsers for PDF and MSDOC. Let me know if you've already worked on them. John --------------------------------- patch ---------------------------------------- diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/conf/nutch-default.xml nutch-cvs-20040605.xing/conf/nutch-default.xml --- nutch-cvs-20040605/conf/nutch-default.xml 2004-06-04 13:24:51.000000000 -0700 +++ nutch-cvs-20040605.xing/conf/nutch-default.xml 2004-06-04 23:32:28.000000000 -0700 @@ -83,6 +83,27 @@ <description>If true, HTTP will log more verbosely.</description> </property> +<!-- FILE properties --> + +<property> + <name>file.content.limit</name> + <value>65536</value> + <description>The length limit for downloaded content, in bytes. + If this value is larger than zero, content longer than it will be + truncated; otherwise (zero or negative), no truncation at all. + </description> +</property> + +<property> + <name>file.content.ignored</name> + <value>true</value> + <description>If true, no file content will be saved during fetch. + And it is probably what we want to set most of time, since file:// URLs + are meant to be local and we can always use them directly at parsing + and indexing stages. Otherwise file contents will be saved. + !! NO IMPLEMENTED YET !! + </description> +</property> <!-- FTP properties --> diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/build.xml nutch-cvs-20040605.xing/src/plugin/build.xml --- nutch-cvs-20040605/src/plugin/build.xml 2004-06-04 13:24:53.000000000 -0700 +++ nutch-cvs-20040605.xing/src/plugin/build.xml 2004-06-05 00:23:08.000000000 -0700 @@ -6,6 +6,7 @@ <!-- Build & deploy all the plugin jars. 
--> <!-- ====================================================== --> <target name="deploy"> + <ant dir="protocol-file" target="deploy"/> <ant dir="protocol-http" target="deploy"/> <ant dir="parse-html" target="deploy"/> <ant dir="parse-text" target="deploy"/> @@ -24,6 +25,7 @@ <!-- Clean all of the plugins. --> <!-- ====================================================== --> <target name="clean"> + <ant dir="protocol-file" target="clean"/> <ant dir="protocol-http" target="clean"/> <ant dir="parse-html" target="clean"/> <ant dir="parse-text" target="clean"/> diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/build.xml nutch-cvs-20040605.xing/src/plugin/protocol-file/build.xml --- nutch-cvs-20040605/src/plugin/protocol-file/build.xml 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/build.xml 2004-06-03 22:19:48.000000000 -0700 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> + +<project name="protocol-file" default="jar"> + + <import file="../build-plugin.xml"/> + +</project> diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/plugin.xml nutch-cvs-20040605.xing/src/plugin/protocol-file/plugin.xml --- nutch-cvs-20040605/src/plugin/protocol-file/plugin.xml 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/plugin.xml 2004-06-05 09:24:09.000000000 -0700 @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="protocol-file" + name="File Protocol Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <extension-point + id="net.nutch.protocol.Protocol" + name="Nutch Protocol"/> + + <runtime> + <library name="protocol-file.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="net.nutch.protocol.file" + name="FileProtocol" + point="net.nutch.protocol.Protocol"> + + <implementation id="net.nutch.protocol.file.File" + 
class="net.nutch.protocol.file.File" + protocolName="file"/> + + </extension> + +</plugin> diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java --- nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileError.java 2004-06-04 23:34:15.000000000 -0700 @@ -0,0 +1,19 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.protocol.file; + +/** Thrown for File error codes. + */ +public class FileError extends FileException { + + private int code; + + public int getCode(int code) { return code; } + + public FileError(int code) { + super("File Error: " + code); + this.code = code; + } + +} diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java --- nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileException.java 2004-06-04 09:46:04.000000000 -0700 @@ -0,0 +1,26 @@ +/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. 
*/ + +package net.nutch.protocol.file; + +import net.nutch.protocol.ProtocolException; + +public class FileException extends ProtocolException { + + public FileException() { + super(); + } + + public FileException(String message) { + super(message); + } + + public FileException(String message, Throwable cause) { + super(message, cause); + } + + public FileException(Throwable cause) { + super(cause); + } + +} diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java --- nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/File.java 2004-06-04 16:25:25.000000000 -0700 @@ -0,0 +1,196 @@ +/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.protocol.file; + +import javax.activation.MimetypesFileTypeMap; +// 20040528, xing, disabled for now +//import xing.net.nutch.util.magicfile.*; + +import net.nutch.net.protocols.HttpDateFormat; + +import net.nutch.util.LogFormatter; +import net.nutch.util.NutchConf; + +import net.nutch.protocol.Content; +import net.nutch.protocol.Protocol; + +import java.util.logging.Level; +import java.util.logging.Logger; + +import java.net.URL; + +import java.io.InputStream; +// 20040528, xing, disabled for now +//import java.io.Reader; +import java.io.IOException; + +/************************************ + * File.java deals with file: scheme. + * + * Configurable parameters are defined under "FILE properties" section + * in ./conf/nutch-default.xml or similar. 
+ * + * @author John Xing + ***********************************/ +public class File implements Protocol { + + public static final Logger LOG = + LogFormatter.getLogger("net.nutch.protocol.file.File"); + + static final int MAX_REDIRECTS = 5; + + static int maxContentLength = NutchConf.getInt("file.content.limit",64*1024); + + // 20040412, xing + // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile + // are placed in each thread before we check out if they're thread-safe. + + // http date format + HttpDateFormat httpDateFormat = null; + + // file name extension to mime-type map + static MimetypesFileTypeMap TYPE_MAP = null; + + static { + try { + // read mime types from config file + InputStream is = + NutchConf.getConfResourceAsInputStream + (NutchConf.get("mime.types.file")); + if (is == null) { + LOG.warning + ("no mime.types.file: won't use url extension for content-type."); + TYPE_MAP = null; + } else { + TYPE_MAP = new MimetypesFileTypeMap(is); + } + + if (is != null) + is.close(); + } catch (IOException e) { + LOG.log(Level.SEVERE, "Unexpected error", e); + } + } + +// 20040528, xing, disabled for now +// // file magic for determining content type +// static MagicFile MAGIC = null; +// +// static { +// try { +// // read file magic from config file +// Reader reader = +// NutchConf.getConfResourceAsReader +// (NutchConf.get("mime.magic.file")); +// if (reader == null) { +// LOG.warning +// ("no mime.magic.file: won't use file magic for content-type."); +// MAGIC = null; +// } else { +// MAGIC = MagicFile.getInstance(reader); +// } +// +// if (reader != null) +// reader.close(); +// } catch (IOException e) { +// LOG.log(Level.SEVERE, "Unexpected error", e); +// } +// } + + // constructor + public File() { + this.httpDateFormat = new HttpDateFormat(); + } + + /** Set the point at which content is truncated. 
*/ + public void setMaxContentLength(int length) {this.maxContentLength = length;} + + public Content getContent(String urlString) throws FileException { + try { + URL url = new URL(urlString); + + int redirects = 0; + + while (true) { + FileResponse response; + response = new FileResponse(urlString, url, this); // make a request + + int code = response.getCode(); + + if (code == 200) { // got a good response + return response.toContent(); // return it + + } else if (code >= 300 && code < 400) { // handle redirect + if (redirects == MAX_REDIRECTS) + throw new FileException("Too many redirects: " + url); + url = new URL(response.getHeader("Location")); + redirects++; + if (LOG.isLoggable(Level.FINE)) + LOG.fine("redirect to " + url); + + } else { // convert to exception + throw new FileError(code); + } + } + } catch (IOException e) { + throw new FileException(e); + } + } + +// protected void finalize () { +// // nothing here +// } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + int maxContentLength = Integer.MIN_VALUE; + String logLevel = "info"; + boolean dumpContent = false; + String urlString = null; + + String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-logLevel")) { + logLevel = args[++i]; + } else if (args[i].equals("-maxContentLength")) { + maxContentLength = Integer.parseInt(args[++i]); + } else if (args[i].equals("-dumpContent")) { + dumpContent = true; + } else if (i != args.length-1) { + System.err.println(usage); + System.exit(-1); + } else + urlString = args[i]; + } + + File file = new File(); + + if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength + file.setMaxContentLength(maxContentLength); + + // set log level + LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); + + Content content = 
file.getContent(urlString); + + System.err.println("Content-Type: " + + content.getContentType()); + System.err.println("Content-Length: " + + content.getMetaData().get("Content-Length")); + System.err.println("Last-Modified: " + + content.getMetaData().get("Last-Modified")); + if (dumpContent) { + System.out.print(new String(content.getContent())); + } + + file = null; + } + +} diff -Nur --exclude='*.txt' --exclude=nutch-site.xml --exclude='*.jar' nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java --- nutch-cvs-20040605/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java 1969-12-31 16:00:00.000000000 -0800 +++ nutch-cvs-20040605.xing/src/plugin/protocol-file/src/java/net/nutch/protocol/file/FileResponse.java 2004-06-04 16:23:19.000000000 -0700 @@ -0,0 +1,261 @@ +/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */ +/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ + +package net.nutch.protocol.file; + +import javax.activation.MimetypesFileTypeMap; +// 20040528, xing, disabled for now +//import xing.net.nutch.util.magicfile.*; + +import net.nutch.protocol.Content; + +import java.net.URL; + +import java.util.TreeMap; +import java.util.Properties; + +import java.util.logging.Level; + +import java.io.InputStream; +import java.io.IOException; + +/************************************ + * FileResponse.java mimics file replies as http response. + * It tries its best to follow http's way for headers, response codes + * as well as exceptions. + * + * Comments: + * (1) java.net.URL and java.net.URLConnection can handle file: scheme. + * However they are not flexible enough, so not used in this implementation. + * + * (2) java.io.File is used for its abstractness across platforms. 
+ * Warning: + * java.io.File API (1.4.2) does not elaborate on how special files, + * such as /dev/* in unix and /proc/* on linux, are treated. Tests show + * (a) java.io.File.isFile() return false for /dev/* + * (b) java.io.File.isFile() return true for /proc/* + * (c) java.io.File.length() return 0 for /proc/* + * We are probably oaky for now. Could be buggy here. + * How about special files on windows? + * + * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. + * They are just treated as individual files. + * + * (4) No funcy POSIX file attributes yet. May never need? + * + * @author John Xing + ***********************************/ +public class FileResponse { + private String orig; + private String base; + private byte[] content; + private int code; + private Properties headers = new Properties(); + + private final File file; + + /** Returns the response code. */ + public int getCode() { return code; } + + /** Returns the value of a named header. */ + public String getHeader(String name) { + return (String)headers.get(name); + } + + public byte[] getContent() { return content; } + + public Content toContent() { + return new Content(orig, base, content, + getHeader("Content-Type"), + headers); + } + + public FileResponse(URL url, File file) + throws FileException, IOException { + this(url.toString(), url, file); + } + + public FileResponse(String orig, URL url, File file) + throws FileException, IOException { + + this.orig = orig; + this.base = url.toString(); + this.file = file; + + if (!"file".equals(url.getProtocol())) + throw new FileException("Not a file url:" + url); + + if (File.LOG.isLoggable(Level.FINE)) + File.LOG.fine("fetching " + url); + + if (url.getPath() != url.getFile()) + File.LOG.warning("url.getPath() != url.getFile(): " + url); + + String path = "".equals(url.getPath()) ? 
"/" : url.getPath(); + + try { + + this.content = null; + + // url.toURI() is only in j2se 1.5.0 + //java.io.File f = new java.io.File(url.toURI()); + java.io.File f = new java.io.File(path); + + if (!f.exists()) { + this.code = 404; // http Not Found + return; + } + + if (!f.canRead()) { + this.code = 401; // http Unauthorized + return; + } + + // symbolic link or relative path on unix + // fix me: what's the consequence on windows platform + // where case is insensitive + if (!f.equals(f.getCanonicalFile())) { + // set headers + TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); + //hdrs.put("Location", f.getCanonicalFile().toURI()); + hdrs.put("Location", f.getCanonicalFile().toURL().toString()); + this.headers.putAll(hdrs); + + this.code = 300; // http redirect + return; + } + + if (f.isDirectory()) { + getDirAsHttpResponse(f); + } else if (f.isFile()) { + getFileAsHttpResponse(f); + } else { + this.code = 500; // http Internal Server Error + return; + } + + } catch (IOException e) { + throw e; + } + + } + + // get file as http response + private void getFileAsHttpResponse(java.io.File f) + throws FileException, IOException { + + // ignore file of size larger than + // Integer.MAX_VALUE = 2^31-1 = 2147483647 + long size = f.length(); + if (size > Integer.MAX_VALUE) { + throw new FileException("file is too large, size: "+size); + // or we can do this? 
+ // this.code = 400; // http Bad request + // return; + } + + // capture content + int len = (int) size; + + if (this.file.maxContentLength > 0 && len > this.file.maxContentLength) + len = this.file.maxContentLength; + + this.content = new byte[len]; + + java.io.InputStream is = new java.io.FileInputStream(f); + int offset = 0; int n = 0; + while (offset < len + && (n = is.read(this.content, offset, len-offset)) >= 0) { + offset += n; + } + if (offset < len) // keep whatever already have, but issue a warning + File.LOG.warning("not enough bytes read from file: "+f.getPath()); + is.close(); + + // set headers + TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); + + hdrs.put("Content-Length", new Long(size).toString()); + + hdrs.put("Last-Modified", + this.file.httpDateFormat.toString(f.lastModified())); + + String contentType = null; +// 20040528, xing, disabled for now +// if (contentType == null && this.file.MAGIC != null) +// contentType = this.file.MAGIC.getMimeType(this.content); + if (contentType == null && this.file.TYPE_MAP != null) + contentType = this.file.TYPE_MAP.getContentType(f.getName()); + if (contentType != null) + hdrs.put("Content-Type", contentType); + + this.headers.putAll(hdrs); + + // response code + this.code = 200; // http OK + } + + // get dir list as http response + private void getDirAsHttpResponse(java.io.File f) + throws IOException { + + String path = f.toString(); + this.content = list2html(f.listFiles(), path, "/".equals(path) ? 
false : true); + + // set headers + TreeMap hdrs = new TreeMap(String.CASE_INSENSITIVE_ORDER); + + hdrs.put("Content-Length", + new Integer(this.content.length).toString()); + + hdrs.put("Content-Type", "text/html"); + + hdrs.put("Last-Modified", + this.file.httpDateFormat.toString(f.lastModified())); + + this.headers.putAll(hdrs); + + // response code + this.code = 200; // http OK + } + + // generate html page from dir list + private byte[] list2html(java.io.File[] list, + String path, boolean includeDotDot) { + + StringBuffer x = new StringBuffer("<html><head>"); + x.append("<title>Index of "+path+"</title></head>\n"); + x.append("<body><h1>Index of "+path+"</h1><pre>\n"); + + if (includeDotDot) { + x.append("<a href='../'>../</a>\t-\t-\t-\n"); + } + + // fix me: we might want to sort list here! but not now. + + java.io.File f; + for (int i=0; i<list.length; i++) { + f = list[i]; + String name = f.getName(); + String time = this.file.httpDateFormat.toString(f.lastModified()); + if (f.isDirectory()) { + // java 1.4.2 api says dir itself and parent dir are not listed + // so the following is not needed. + //if (name.equals(".") || name.equals("..")) + // continue; + x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t"); + x.append(time+"\t-\n"); + } else if (f.isFile()) { + x.append("<a href='"+name+ "'>"+name+"</a>\t"); + x.append(time+"\t"+f.length()+"\n"); + } else { + // ignore any other + } + } + + x.append("</pre></body></html>\n"); + + return new String(x).getBytes(); + } + +} ------------------------------------------------------- This SF.Net email is sponsored by the new InstallShield X. >From Windows to Linux, servers to mobile, InstallShield X is the one installation-authoring solution that does it all. Learn more and evaluate today! http://www.installshield.com/Dev2Dev/0504 _______________________________________________ Nutch-developers mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-developers
