Hi,

I've just written a protocol-smb plugin; it's really simple (code attached). It uses the jcifs lib and seems to work - but there is some stuff I'd like to discuss...

Nutch is glued to URL, which works if you write a URLHandler. No problem so far, but you can't install a URLHandler everywhere - have a look at the jcifs FAQ ( http://jcifs.samba.org/src/docs/faq.html ). Most important: it won't work in your WAR - so protocol plugins will be useless in a web context! That might cause a lot of trouble. Moreover, Nutch will never be able to handle \\192.168.0.1\ correctly with URL....

Converting directories into HTML lists sucks. And reproducing the code is even worse. Perhaps a virtual mime-type could be added (e.g. "nutch/dir"). Almost forgot: tell me how I should index files with " and ' in their name (currently I check for ' and change the href quotes). Same problem for file://

Most protocols are not mime-type aware (e.g. file:// - indexed my mp3 collection with the text parser, great fun!). I've added a simple mime-type guess, but this shouldn't be part of the protocol handler.

Anyway, feel free to use the smb code, it's rather simple/basic.
There is still a multithreading issue left :( but the very basic crawling process seems to work (-threads 1). I've not yet tested the generated index (= I've not yet indexed my hd and I've not yet tried to search)

I've added the apache header, hope this is ok.
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.protocol.smb;

import java.io.DataInputStream;
import java.io.IOException;
import java.util.regex.Pattern;

import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypes;

/**
 * Nutch protocol plugin that fetches content from SMB/CIFS shares through
 * the jcifs library.
 *
 * <p>Regular files are returned as raw bytes (truncated to the configured
 * <code>smb.content.limit</code>) together with a guessed MIME type.
 * Anything that is not a regular file is listed and rendered as a small
 * HTML index page so that the crawler can follow the links it contains.</p>
 *
 * <p>Response metadata is created per request; the only instance state is
 * what {@link #setConf(Configuration)} assigns.</p>
 *
 * @author treffer
 */
public class Smb implements Protocol {

	// Declared before the static initializer below: static initializers run
	// in textual order and that block logs through LOG.
	public static final Log LOG = LogFactory.getLog(Smb.class);

	static {
		try {
			jcifs.Config.registerSmbURLHandler();
		} catch (Throwable t) {
			// Best effort only: registration is known to fail in some
			// environments (see the jcifs FAQ). Do not abort class loading,
			// but leave a trace instead of failing silently.
			LOG.warn("Could not register the jcifs smb: URL handler", t);
		}
	}

	/**
	 * Maximum number of bytes fetched per file. Stays 0 (download nothing)
	 * until {@link #setConf(Configuration)} is called; a negative value
	 * disables truncation.
	 */
	private int maxContentLength;

	private Configuration conf;

	private MimeTypes mimeTypes;

	/**
	 * Fetches the resource behind <code>url</code>.
	 *
	 * <p>Backslashes are converted to forward slashes and a missing
	 * <code>smb:</code> scheme is prepended, so UNC style input such as
	 * <code>\\host\share</code> is accepted as well.</p>
	 *
	 * @param url   location to fetch
	 * @param datum crawl datum of the url (not used by this implementation)
	 * @return the fetched content, or an output carrying only a
	 *         {@link ProtocolStatus} built from the exception if anything failed
	 */
	public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
		if (LOG.isDebugEnabled()) {
			LOG.debug("Fetching " + url);
		}
		/*
		 * Most of the code here was "inspired" from FtpResponse
		 */
		try {
			// String is immutable: the result of replace() has to be kept.
			String urlString = url.toString().replace('\\', '/');
			if (!urlString.startsWith("smb:")) {
				String prefix;
				if (urlString.startsWith("//")) {
					prefix = "smb:";
				} else if (urlString.startsWith("/")) {
					prefix = "smb:/";
				} else {
					prefix = "smb://";
				}
				urlString = prefix + urlString;
			}
			SmbFile smbFile = new SmbFile(urlString);
			// Fresh metadata for every request so nothing leaks from an
			// earlier fetch and concurrent fetches do not share headers.
			Metadata header = new Metadata();
			byte[] data;
			if (smbFile.isFile()) {
				data = file2html(smbFile, urlString, header);
			} else {
				data = dir2html(smbFile, urlString, header);
			}
			return new ProtocolOutput(new Content(urlString, urlString, data,
					header.get(Response.CONTENT_TYPE), header, conf));
		} catch (Exception e) {
			return new ProtocolOutput(null, new ProtocolStatus(e));
		}
	}

	/**
	 * Renders the children of <code>smbFile</code> as an HTML index page.
	 *
	 * @param smbFile   resource to list
	 * @param urlString normalized url of the resource, only used to estimate
	 *                  the buffer size
	 * @param header    receives Last-Modified, Content-Type and Content-Length
	 * @return the HTML page as bytes
	 * @throws SmbException if the resource cannot be listed or inspected
	 */
	private byte[] dir2html(SmbFile smbFile, String urlString, Metadata header) throws SmbException {
		header.set(Response.LAST_MODIFIED, HttpDateFormat.toString(smbFile.getLastModified()));
		header.set(Response.CONTENT_TYPE, "text/html");

		SmbFile[] subs = smbFile.listFiles();
		// Try to guess buffer size
		int bufferSize = subs.length * (urlString.length() + 100) + 2048;
		StringBuffer sb = new StringBuffer(bufferSize);

		String self = escapeHtml(smbFile.toString());
		sb.append("<html><head>");
		sb.append("<title>Index of ").append(self).append("</title></head>\n");
		sb.append("<body><h1>Index of ").append(self).append("</h1><pre>\n");

		String parent = smbFile.getParent();
		if (parent != null && !"smb://".equals(parent)) {
			// Not toplevel, Host + Directory => reference to parent directory
			sb.append("<a href=");
			sb.append(quote(parent));
			sb.append(">");
			sb.append(escapeHtml(parent));
			sb.append("</a>\t-\t-\t-\n");
		}
		for (int i = 0; i < subs.length; i++) {
			String name = subs[i].toString();
			sb.append("<a href=");
			sb.append(quote(name));
			sb.append(">");
			sb.append(escapeHtml(name));
			sb.append("</a>\t");
			sb.append(HttpDateFormat.toString(subs[i].getLastModified()));
			sb.append("\t");
			if (subs[i].isFile()) {
				// Only regular files have a meaningful size.
				sb.append(subs[i].getContentLength());
			} else {
				sb.append("-");
			}
			sb.append("\n");
		}
		sb.append("</pre></body></html>\n");
		if (LOG.isDebugEnabled()) {
			// Fill ratio of the pre-sized buffer, useful for tuning the estimate.
			LOG.debug(Double.toString(sb.length() / (double) bufferSize));
		}
		// NOTE(review): uses the platform default charset and the content
		// type carries no charset - confirm what the HTML parser expects.
		byte[] data = sb.toString().getBytes();
		header.set(Response.CONTENT_LENGTH, Integer.toString(data.length));
		return data;
	}

	/**
	 * Reads a regular file, truncated to the configured limit, and records
	 * its length, modification time and guessed MIME type in
	 * <code>header</code>.
	 *
	 * @param smbFile   file to read
	 * @param urlString normalized url of the file; its last path segment is
	 *                  used for the name based MIME lookup
	 * @param header    receives Content-Length, Last-Modified and, when a type
	 *                  could be determined, Content-Type
	 * @return the (possibly truncated) file content
	 * @throws IOException if the file cannot be read completely
	 */
	private byte[] file2html(SmbFile smbFile, String urlString, Metadata header) throws IOException {
		int available = smbFile.getContentLength();
		// A negative limit means "do not truncate" instead of producing a
		// negative array size.
		int contentLength = (maxContentLength < 0) ? available : Math.min(maxContentLength, available);
		header.set(Response.CONTENT_LENGTH, Integer.toString(contentLength));
		header.set(Response.LAST_MODIFIED, HttpDateFormat.toString(smbFile.getLastModified()));

		byte[] data = new byte[contentLength];
		DataInputStream dis = new DataInputStream(smbFile.getInputStream());
		try {
			dis.readFully(data);
		} finally {
			// Release the connection even when the read fails.
			dis.close();
		}

		String fileName = urlString.substring(urlString.lastIndexOf('/') + 1);
		MimeType byName = mimeTypes.getMimeType(fileName);
		MimeType byContent = mimeTypes.getMimeType(data);
		// The content based guess wins over the name based one.
		MimeType chosen = (byContent != null) ? byContent : byName;
		if (chosen == null) {
			LOG.warn("No MimeType was found for " + fileName);
		} else {
			if (byName != null && byContent != null && !byName.equals(byContent)) {
				LOG.warn("MimeType missmatch, choosing " + byContent.getName() + " instead of " + byName.getName());
			}
			header.set(Response.CONTENT_TYPE, chosen.getName());
		}
		return data;
	}

	/**
	 * Wraps <code>url</code> in quotes for use as an HTML attribute value.
	 *
	 * <p>Single quotes are used unless the value contains one, in which case
	 * double quotes are used. If the value contains both kinds of quote, the
	 * double quotes inside it are written as <code>&amp;quot;</code> so the
	 * attribute stays well formed.</p>
	 *
	 * @param url attribute value to quote, must not be null
	 * @return the quoted attribute value
	 */
	public String quote(String url) {
		if (url.indexOf('\'') == -1) {
			return '\'' + url + '\'';
		}
		if (url.indexOf('"') == -1) {
			return "\"" + url + "\"";
		}
		return "\"" + url.replaceAll("\"", "&quot;") + "\"";
	}

	/**
	 * Escapes the characters that would otherwise be interpreted as markup
	 * when <code>text</code> is written into an HTML text node.
	 */
	private static String escapeHtml(String text) {
		StringBuffer out = new StringBuffer(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '&':
				out.append("&amp;");
				break;
			case '<':
				out.append("&lt;");
				break;
			case '>':
				out.append("&gt;");
				break;
			default:
				out.append(c);
			}
		}
		return out.toString();
	}

	/**
	 * @return the configuration passed to {@link #setConf(Configuration)},
	 *         or null if none has been set yet
	 */
	public Configuration getConf() {
		return conf;
	}

	/**
	 * Stores the configuration and derives the per-file size limit
	 * (<code>smb.content.limit</code>, default 64 KiB) and the MIME type
	 * registry (<code>mime.types.file</code>) from it.
	 *
	 * @param conf configuration to read the settings from
	 */
	public void setConf(Configuration conf) {
		this.conf = conf;
		this.maxContentLength = conf.getInt("smb.content.limit", 64 * 1024);
		this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
	}

}

Reply via email to