Update of /cvsroot/nutch/playground/src/java/net/nutch/net
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/java/net/nutch/net
Added Files:
RegexURLFilter.java URLFilter.java Http.java
URLFilterFactory.java UrlNormalizer.java
Log Message:
initial commit
--- NEW FILE: RegexURLFilter.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Pattern;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.MalformedPatternException;
import net.nutch.util.*;
/** Filters URLs based on a file of regular expressions. The config file is
 * named by the Nutch configuration property "urlfilter.regex.file".
 *
 * <p>The format of this file is:
 * <pre>
 * [+-]&lt;regex&gt;
 * </pre>
 * where plus means go ahead and index it and minus means no.  Empty lines
 * and lines starting with a space or '#' are skipped.  Rules are tried in
 * file order; the first rule whose pattern is found in the URL decides, and
 * a URL that matches no rule is rejected.
 */
public class RegexURLFilter implements URLFilter {

  /** One configuration line: a compiled pattern and its accept/reject sign. */
  private static class Rule {
    public Perl5Pattern pattern;
    public boolean sign;        // true for '+', false for '-'
    public String regex;        // pattern source text, kept for debugging
  }

  /** Rules in configuration-file order. */
  private List rules;

  // Single matcher shared by all calls; filter() is synchronized around it.
  private PatternMatcher matcher = new Perl5Matcher();

  /**
   * Builds a filter from the resource named by the
   * "urlfilter.regex.file" configuration property.
   *
   * @throws IOException if the resource cannot be found or read, or a line
   *         starts with an unrecognised character
   * @throws MalformedPatternException if a regular expression does not compile
   */
  public RegexURLFilter() throws IOException, MalformedPatternException {
    String file = NutchConf.get("urlfilter.regex.file");
    Reader reader = NutchConf.getConfResourceAsReader(file);
    if (reader == null)           // report a missing resource clearly
      throw new IOException("Can't find resource: " + file);
    rules = readConfigurationFile(reader);
  }

  /**
   * Builds a filter from the named rule file.
   *
   * @param filename path of the rule file
   * @throws IOException if the file cannot be read, or a line starts with
   *         an unrecognised character
   * @throws MalformedPatternException if a regular expression does not compile
   */
  public RegexURLFilter(String filename)
    throws IOException, MalformedPatternException {
    rules = readConfigurationFile(new FileReader(filename));
  }

  /**
   * Applies the rules to a URL.
   *
   * @param url the URL to test
   * @return <code>url</code> if the first matching rule is a '+' rule;
   *         <code>null</code> if it is a '-' rule or no rule matches
   */
  public synchronized String filter(String url) {
    for (Iterator i = rules.iterator(); i.hasNext();) {
      Rule r = (Rule) i.next();
      if (matcher.contains(url, r.pattern)) {
        //System.out.println("Matched " + r.regex);
        return r.sign ? url : null;
      }
    }
    return null;                                  // assume no go
  }

  /**
   * Parses a rule file and closes the reader when done, whether parsing
   * succeeds or fails.
   *
   * @param reader source of the rule lines; always closed on return
   * @return list of {@link Rule} in file order
   */
  private static List readConfigurationFile(Reader reader)
    throws IOException, MalformedPatternException {

    BufferedReader in = new BufferedReader(reader);
    try {
      Perl5Compiler compiler = new Perl5Compiler();
      List rules = new ArrayList();
      String line;

      while ((line = in.readLine()) != null) {
        if (line.length() == 0)
          continue;

        boolean sign;
        switch (line.charAt(0)) {
        case '+' :
          sign = true;
          break;
        case '-' :
          sign = false;
          break;
        case ' ' : case '\n' : case '#' :         // skip blank & comment lines
          continue;
        default :
          throw new IOException("Invalid first character: " + line);
        }

        String regex = line.substring(1);

        Rule rule = new Rule();
        rule.pattern = (Perl5Pattern) compiler.compile(regex);
        rule.sign = sign;
        rule.regex = regex;
        rules.add(rule);
      }

      return rules;
    } finally {
      in.close();                                 // also closes the wrapped reader
    }
  }

  /**
   * Command-line check: reads URLs from standard input, one per line, and
   * echoes each prefixed with '+' (accepted) or '-' (rejected).
   */
  public static void main(String args[])
    throws IOException, MalformedPatternException {

    RegexURLFilter filter = new RegexURLFilter();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null) {
      String out = filter.filter(line);
      if (out != null) {
        System.out.print("+");
        System.out.println(out);
      } else {
        System.out.print("-");
        System.out.println(line);
      }
    }
  }
}
--- NEW FILE: URLFilter.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
/**
 * Interface used to limit which URLs enter Nutch.  Used by the injector and
 * the db updater.
 */
public interface URLFilter {

  /**
   * Filters (or transforms) a URL.
   *
   * @param url the URL to examine
   * @return the original URL to let it pass, or <code>null</code> to
   *         "delete" the URL
   */
  String filter(String url);
}
--- NEW FILE: Http.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import java.io.IOException;
import java.io.EOFException;
import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.io.InputStream;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.Socket;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;
import net.nutch.util.LogFormatter;
import net.nutch.util.GZIPUtils;
import net.nutch.util.NutchConf;
import net.nutch.fetcher.FetcherStatus;
import net.nutch.fetcher.FetcherConstants;
/** A simple HTTP client. */
public class Http {
public static final Logger LOG =
LogFormatter.getLogger("net.nutch.net.Http");
private static final int DEFAULT_PORT = 80;
private static final int CODE_OK = 200;
private static final int BUFFER_SIZE = 16384;
private static final int MAX_REDIRECTS = 5;
/** Reserved value for HTTP version number, does not denote any version */
public static final int HTTP_VER_NOTSET= -1;
/** HTTP version 1.0 (the earliest version we use) */
public static final int HTTP_VER_1_0= 0;
/** HTTP version 1.1 */
public static final int HTTP_VER_1_1= 1;
/** Always indicates the latest HTTP version we support, currently 1.1 */
public static final int HTTP_VER_LATEST;
private int timeout = NutchConf.getInt("http.timeout", 10000);
private int maxContentLength= NutchConf.getInt("http.content.limit",64*1024);
private String agentString= null;
private String agentEmail = NutchConf.get("http.agent.email");
static {
if (NutchConf.getBoolean("http.version.1.1", true))
HTTP_VER_LATEST= HTTP_VER_1_1;
else
HTTP_VER_LATEST= HTTP_VER_1_0;
}
/**
 * Picks the lesser of two HTTP version codes, ignoring unset values.
 *
 * @param ver1 first version code
 * @param ver2 second version code
 * @return the smaller of the two codes; if one of them is
 *         HTTP_VER_NOTSET the other one is returned, so the result is
 *         HTTP_VER_NOTSET only when both are
 */
public static int minHttpVersion(int ver1, int ver2) {
  if (ver1 == HTTP_VER_NOTSET)
    return ver2;
  if (ver2 == HTTP_VER_NOTSET)
    return ver1;
  return Math.min(ver1, ver2);
}
/** Superclass for non-IO exceptions thrown during HTTP requests
* or parsing of responses.  Constructors are package-private.
*/
public abstract class HttpException extends Exception {
/** Creates an exception carrying only a detail message. */
HttpException(String msg) {
super(msg);
}
/** Creates an exception carrying a detail message and its underlying cause. */
HttpException(String msg, Throwable cause) {
super(msg, cause);
}
}
/** Exception indicating that the status line could not be parsed. */
public class BadStatusLineException extends HttpException {
/** Creates the exception from a description and the parse failure that caused it. */
BadStatusLineException(String msg, Throwable cause) {
super(msg, cause);
}
}
/** Exception indicating that a header line could not be parsed. */
public class BadHeaderLineException extends HttpException {
/** Creates the exception with a description of the offending line. */
BadHeaderLineException(String msg) {
super(msg);
}
}
/**
* Exception indicating that the Content-Length header could not be
* parsed.
*/
public class ContentLengthParseException extends HttpException {
/** Creates the exception; msg carries the text that failed to parse. */
ContentLengthParseException(String msg) {
super(msg);
}
}
/**
* Superclass for exceptions which indicate errors specific to
* HTTP/1.1 (when these are caught, future attempts to fetch this page
* should consider falling back to 1.0).
*/
public abstract class Http11Exception extends HttpException {
/** Creates an exception carrying only a detail message. */
Http11Exception(String msg) {
super(msg);
}
/** Creates an exception carrying a detail message and its underlying cause. */
Http11Exception(String msg, Throwable cause) {
super(msg, cause);
}
}
/**
* Exception indicating that a Chunk-Length could not be parsed
* as a hex integer.
*/
public class ChunkLengthParseException extends Http11Exception {
/** Creates the exception with a description of the unparseable chunk header. */
ChunkLengthParseException(String msg) {
super(msg);
}
}
/**
* Exception indicating that the server closed the socket in
* mid-chunk.
*/
public class ChunkEOFException extends Http11Exception {
/** Creates the exception with a description of where the stream ended. */
ChunkEOFException(String msg) {
super(msg);
}
}
/**
* Exception indicating that some data was received, but could not
* be decompressed.
*/
public class DecompressionException extends Http11Exception {
/** Creates the exception with a description of the decompression failure. */
DecompressionException(String msg) {
super(msg);
}
}
/**
 * Simple container for raw byte counts (sent and received) and
 * HTTP version info, which can optionally be used for accounting.
 * Also holds a cached InetAddress, so user can cache an address.
 */
public static class MiscHttpAccounting {

  private long bytesRead;
  private long bytesSent;
  private int httpVersion;
  private InetAddress addr;

  /** Creates an accounting record with zero counts, no version and no address. */
  public MiscHttpAccounting() {
    this.bytesRead = 0L;
    this.bytesSent = 0L;
    this.httpVersion = HTTP_VER_NOTSET;
    this.addr = null;
  }

  /** Returns the number of bytes counted as read so far. */
  public long getBytesRead() {
    return this.bytesRead;
  }

  /** Adds <code>incr</code> to the count of bytes read. */
  public void incrementBytesRead(long incr) {
    this.bytesRead = this.bytesRead + incr;
  }

  /** Returns the number of bytes counted as sent so far. */
  public long getBytesSent() {
    return this.bytesSent;
  }

  /** Adds <code>incr</code> to the count of bytes sent. */
  public void incrementBytesSent(long incr) {
    this.bytesSent = this.bytesSent + incr;
  }

  /** Returns the recorded server HTTP version code (one of the HTTP_VER_* values). */
  public int getServHttpVersion() {
    return this.httpVersion;
  }

  /** Records the server HTTP version code. */
  public void setServHttpVersion(int httpVersion) {
    this.httpVersion = httpVersion;
  }

  /** Returns the cached address, or <code>null</code> if none was stored. */
  public InetAddress getAddr() {
    return this.addr;
  }

  /** Stores an address for later reuse. */
  public void setAddr(InetAddress addr) {
    this.addr = addr;
  }

  /** Restores the freshly-constructed state: zero counts, no version, no address. */
  public void reset() {
    this.addr = null;
    this.httpVersion = HTTP_VER_NOTSET;
    this.bytesSent = 0L;
    this.bytesRead = 0L;
  }
}
/** An HTTP response. */
public class Response {
private int code;                     // status code of the final response
private int numContinues;             // count of 100/Continue responses skipped
private Map headers;                  // headers of the final response
private byte[] content;               // body, decompressed if it was gzipped
private byte[] compressedContent;     // body as transmitted, when compressed
MiscHttpAccounting httpAccounting;    // optional byte/version accounting, may be null

/** Returns the response code. */
public int getCode() {
  return this.code;
}

/**
 * Returns the value of a named header.
 *
 * @param name header name to look up
 * @return the header value, or <code>null</code> if there is no such header
 */
public String getHeader(String name) {
  Object value = this.headers.get(name);
  return (String) value;
}

/** Returns the full content of the response. */
public byte[] getContent() {
  return this.content;
}

/**
 * Returns the compressed version of the content if the server
 * transmitted a compressed version, or <code>null</code>
 * otherwise.
 */
public byte[] getCompressedContent() {
  return this.compressedContent;
}

/**
 * Returns the number of 100/Continue headers encountered
 */
public int getNumContinues() {
  return this.numContinues;
}
/**
 * Fetches <code>url</code> with the latest supported HTTP version, resolving
 * the host itself and keeping no accounting.
 */
private Response(URL url)
  throws IOException, HttpException {
  this(url, null, null, HTTP_VER_LATEST);
}

/**
 * Performs a single GET request and reads the complete response.
 *
 * @param url            the URL to fetch; its protocol must be "http"
 * @param addr           address to connect to, or <code>null</code> to
 *                       resolve the URL's host (the resolved address is
 *                       then stored in <code>httpAccounting</code>, if given)
 * @param httpAccounting optional byte/version accounting, may be
 *                       <code>null</code>
 * @param httpVersion    HTTP_VER_1_0 or HTTP_VER_1_1; out-of-range values
 *                       are replaced by HTTP_VER_LATEST
 * @throws IOException   if the URL is not an HTTP URL or on any I/O failure
 * @throws HttpException if the response cannot be parsed or decompressed
 */
private Response(URL url, InetAddress addr,
                 MiscHttpAccounting httpAccounting,
                 int httpVersion)
  throws IOException, HttpException {

  this.httpAccounting= httpAccounting;

  if (!"http".equals(url.getProtocol()))
    throw new IOException("Not an HTTP url:" + url);

  if ( (httpVersion < 0) || (httpVersion > HTTP_VER_LATEST) )
    httpVersion= HTTP_VER_LATEST;

  if (LOG.isLoggable(Level.FINE))
    LOG.fine("fetching " + url);

  String path = "".equals(url.getFile()) ? "/" : url.getFile();

  // some servers will redirect a request with a host line like
  // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
  // don't want the :80...
  int port;
  String portString;
  if (url.getPort() == -1) {
    port= DEFAULT_PORT;
    portString= "";
  } else {
    port= url.getPort();
    portString= ":" + port;
  }

  Socket socket = null;
  try {
    socket = new Socket();                    // create the socket
    socket.setSoTimeout(timeout);

    if (addr == null) {
      addr= InetAddress.getByName(url.getHost());
      if (httpAccounting != null)
        httpAccounting.setAddr(addr);
    }

    // connect
    InetSocketAddress sockAddr= new InetSocketAddress(addr, port);
    socket.connect(sockAddr, timeout);

    OutputStream req = socket.getOutputStream(); // make request

    StringBuffer reqStr = new StringBuffer("GET ");
    reqStr.append(path);
    if (httpVersion == HTTP_VER_1_1)
      reqStr.append(" HTTP/1.1\r\n");
    else
      reqStr.append(" HTTP/1.0\r\n");

    reqStr.append("Host: ");
    reqStr.append(url.getHost());
    reqStr.append(portString);
    reqStr.append("\r\n");

    if (httpVersion == HTTP_VER_1_1) {
      reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
      reqStr.append("Connection: close\r\n");
    }

    if ((agentString == null) || (agentString.length() == 0)) {
      LOG.severe("User-agent is not set!");
    } else {
      reqStr.append("User-Agent: ");
      reqStr.append(agentString);
      reqStr.append("\r\n");
    }

    reqStr.append("\r\n");

    // Encode with a fixed single-byte charset so the bytes on the wire do
    // not depend on the platform default encoding.
    byte[] reqBytes= reqStr.toString().getBytes("ISO-8859-1");
    if (httpAccounting != null)
      httpAccounting.incrementBytesSent(reqBytes.length);

    req.write(reqBytes);
    req.flush();

    PushbackInputStream in =                  // process response
      new PushbackInputStream(
        new BufferedInputStream(socket.getInputStream(), BUFFER_SIZE),
        BUFFER_SIZE) ;

    StringBuffer line = new StringBuffer();

    // Skip over any number of "100 Continue" interim responses; the code
    // and headers kept are those of the first non-100 response.
    numContinues= -1;
    boolean haveSeenNonContinueStatus= false;
    while (!haveSeenNonContinueStatus) {
      numContinues++;
      // parse status code line
      this.code = parseStatusLine(in, line);
      // parse headers
      this.headers = parseHeaders(in, line);
      haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
    }

    // Coding names are case-insensitive and may carry stray whitespace.
    String transferCoding= getHeader("Transfer-Encoding");
    if (transferCoding != null
        && "chunked".equalsIgnoreCase(transferCoding.trim())) {
      LOG.fine("fetching chunked!");
      try {
        readChunkedContent(in, line);
      } catch (EOFException e) {
        throw new ChunkEOFException("unexpected EOF in chunked content from "
                                    + url);
      }
    } else {
      LOG.fine("fetching plain!");
      readPlainContent(in);
    }

    String contentEncoding= getHeader("Content-Encoding");
    if (contentEncoding != null)
      contentEncoding= contentEncoding.trim();

    if ("gzip".equalsIgnoreCase(contentEncoding)
        || "x-gzip".equalsIgnoreCase(contentEncoding)) {
      LOG.fine("uncompressing....");
      compressedContent= content;

      FetcherStatus.logTraceMisc(FetcherConstants.MISC_INFORMATIONAL,
                                 "about to decompress: " + url);

      content= GZIPUtils.unzipBestEffort(compressedContent,
                                         maxContentLength);
      if (content == null)
        throw new DecompressionException("unzipBestEffort returned null");

      if (LOG.isLoggable(Level.FINE))
        LOG.fine("fetched " + compressedContent.length
                 + " bytes of compressed content (expanded to "
                 + content.length + " bytes) from " + url);
    } else {
      if (LOG.isLoggable(Level.FINE))
        LOG.fine("fetched " + content.length + " bytes from " + url);
    }

  } finally {
    if (socket != null)
      socket.close();
  }
}
/**
 * Reads a non-chunked response body into <code>content</code>.
 *
 * <p>Reading stops at end of stream, after Content-Length bytes (when that
 * header is present), or after <code>maxContentLength</code> bytes,
 * whichever comes first.  Each read is bounded by the bytes still wanted,
 * so the stored content never exceeds the limit.
 *
 * @param in stream positioned at the first byte of the body
 * @throws ContentLengthParseException if Content-Length is not a
 *         non-negative decimal integer
 * @throws IOException on read failure
 */
private void readPlainContent(InputStream in)
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;          // get content length
  String contentLengthString = (String)headers.get("Content-Length");
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      contentLength = Integer.parseInt(contentLengthString);
    } catch (NumberFormatException e) {
      throw new ContentLengthParseException(contentLengthString);
    }
    if (contentLength < 0)                        // as malformed as non-numeric
      throw new ContentLengthParseException(contentLengthString);
  }

  if (contentLength > maxContentLength)           // limit download size
    contentLength = maxContentLength;

  ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);
  byte[] bytes = new byte[BUFFER_SIZE];
  int length = 0;                                 // read content

  while (length < contentLength) {
    // never ask for more than is still wanted
    int toRead = Math.min(bytes.length, contentLength - length);
    int count = in.read(bytes, 0, toRead);
    if (count == -1)
      break;

    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(count);

    out.write(bytes, 0, count);
    length += count;
  }

  this.content = out.toByteArray();
}
/**
 * Reads a body sent with "Transfer-Encoding: chunked" into
 * <code>content</code>.
 *
 * <p>At most <code>maxContentLength</code> body bytes are kept.  When that
 * limit cuts a chunk short, reading stops there: the stream is then in
 * the middle of a chunk, so neither further chunk headers nor trailers
 * are parsed.  When the terminating zero-length chunk is reached, any
 * trailer lines are read and discarded.
 *
 * @param in   stream positioned at the first chunk-size line
 * @param line scratch buffer used for reading lines
 * @throws ChunkLengthParseException if a chunk size is not a non-negative
 *         hexadecimal integer
 * @throws ChunkEOFException if the stream ends inside a chunk
 * @throws IOException on read failure; an EOFException is thrown if the
 *         stream ends where a line was expected
 */
private void readChunkedContent(PushbackInputStream in,
                                StringBuffer line)
  throws HttpException, IOException {

  boolean doneChunks= false;
  int contentBytesRead= 0;
  byte[] bytes = new byte[BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);

  while (!doneChunks) {
    LOG.fine("Http: starting chunk");

    readLine(in, line, false);
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());

    // chunk header is "<hex-size>[;chunk-extension]"
    int pos= line.indexOf(";");
    String chunkLenStr= (pos < 0) ? line.toString() : line.substring(0, pos);
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e) {
      throw new ChunkLengthParseException(line.toString());
    }
    if (chunkLen < 0)
      throw new ChunkLengthParseException(line.toString());

    if (chunkLen == 0) {
      doneChunks= true;
      break;
    }

    // Compare against the remaining budget (not a sum) to avoid overflow.
    int remaining= maxContentLength - contentBytesRead;
    boolean truncated= chunkLen > remaining;
    if (truncated)
      chunkLen= remaining;

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {
      int toRead= Math.min(chunkLen - chunkBytesRead, BUFFER_SIZE);
      int len= in.read(bytes, 0, toRead);

      if (len == -1)
        throw new ChunkEOFException("after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead
                                    + " in current chunk");

      // DANGER!!! Will print GZIPed stuff right to your terminal!
      // LOG.fine("read: " + new String(bytes, 0, len));

      if (httpAccounting != null)
        httpAccounting.incrementBytesRead(len);

      out.write(bytes, 0, len);
      chunkBytesRead+= len;
    }
    contentBytesRead+= chunkBytesRead;

    if (truncated)                // content limit hit in mid-chunk: stop here
      break;

    readLine(in, line, false);    // consume the line end that follows the chunk
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());
  }

  this.content= out.toByteArray();

  if (doneChunks)                 // trailers follow only the final chunk
    parseHeaders(in, line);
}
/**
 * Reads and parses the status line, e.g. "HTTP/1.1 200 OK".
 *
 * <p>If accounting is enabled, the server's HTTP version is recorded as
 * HTTP_VER_1_0 or HTTP_VER_1_1.  A missing, malformed or non-1.x version
 * is recorded as HTTP_VER_1_0 so callers fall back to the older protocol.
 *
 * @param in   stream positioned at the start of the status line
 * @param line scratch buffer; holds the status line on return
 * @return the numeric status code
 * @throws BadStatusLineException if the status code is not an integer
 * @throws IOException on read failure, including EOFException if the
 *         stream ends before a line terminator
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {

  readLine(in, line, false);
  // approximate bytes by chars- should be right for HTTP
  if (httpAccounting != null)
    httpAccounting.incrementBytesRead(line.length());

  int codeStart = line.indexOf(" ");
  int codeEnd = line.indexOf(" ", codeStart+1);

  // handle lines with no plaintext result code, ie:
  // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
  if (codeEnd == -1)
    codeEnd= line.length();

  int code;
  try {
    code= Integer.parseInt(line.substring(codeStart+1, codeEnd));
  } catch (NumberFormatException e) {
    throw new BadStatusLineException("bad status line '" + line
                                     + "': " + e.getMessage(), e);
  }

  // Version is "HTTP/<major>.<minor>" before the first space.  Only
  // attempt to parse it when the '.' lies between the "HTTP/" prefix and
  // the status code, so substring() cannot be handed bad indexes.
  int versionCode= HTTP_VER_NOTSET;
  if (codeStart > 5 && line.toString().startsWith("HTTP/")) {
    int dotPos= line.indexOf(".");
    if (dotPos > 5 && dotPos < codeStart) {
      try {
        int httpMajorVer= Integer.parseInt(line.substring(5, dotPos));
        int httpMinorVer=
          Integer.parseInt(line.substring(dotPos+1, codeStart));
        if (httpMajorVer == 1)
          versionCode= (httpMinorVer < 1) ? HTTP_VER_1_0 : HTTP_VER_1_1;
      } catch (NumberFormatException e) {
        // unparseable version: leave unset and fall back below
      }
    }
  }

  // bogus or absent version: always fall back to 1.0
  int servVersionCode=
    (versionCode == HTTP_VER_NOTSET) ? HTTP_VER_1_0 : versionCode;

  if (httpAccounting != null)
    httpAccounting.setServHttpVersion(servVersionCode);

  return code;
}
/**
 * Splits one header line at its first colon and stores the pair in
 * <code>headers</code>.
 *
 * <p>The key is everything before the colon; the value is everything after
 * it with leading spaces and tabs removed.  A line without a colon is
 * silently ignored if it consists only of whitespace, otherwise it is an
 * error.
 *
 * @param line    the header line to parse
 * @param headers map that receives the key/value pair
 * @throws BadHeaderLineException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line, TreeMap headers)
  throws IOException, HttpException {

  final int length = line.length();
  final int colon = line.indexOf(":");          // key is up to colon

  if (colon == -1) {
    int firstNonBlank = 0;
    while (firstNonBlank < length
           && Character.isWhitespace(line.charAt(firstNonBlank)))
      firstNonBlank++;

    if (firstNonBlank != length)
      throw new BadHeaderLineException("No colon in header:" + line);
    return;                                     // blank line: nothing to store
  }

  int valueStart = colon + 1;                   // skip whitespace
  while (valueStart < length) {
    char c = line.charAt(valueStart);
    if (c != ' ' && c != '\t')
      break;
    valueStart++;
  }

  headers.put(line.substring(0, colon), line.substring(valueStart));
}
/**
 * Reads header lines up to the blank line that ends them and returns them
 * in a new map whose keys compare case-insensitively.
 */
private Map parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER);
  return parseHeaders(in, line, headers);
}

/**
 * Reads header lines up to the blank line that ends them, adding each to
 * an existing TreeMap.
 *
 * <p>Also copes with responses that omit the blank line: if a line
 * contains the start of an HTML document, the markup is pushed back onto
 * the stream as body content, whatever precedes it is parsed as a last
 * header on a best-effort basis, and header parsing ends.
 *
 * @param in      stream positioned at the first header line
 * @param line    scratch buffer used for reading lines
 * @param headers map that receives the parsed headers
 * @return <code>headers</code>
 * @throws BadHeaderLineException if a regular header line has no colon
 * @throws IOException on read failure, including EOFException if the
 *         stream ends before a line terminator
 */
private Map parseHeaders(PushbackInputStream in, StringBuffer line,
                         TreeMap headers)
  throws IOException, HttpException {
  while (readLine(in, line, true) != 0) {

    // handle HTTP responses with missing blank line after headers
    int pos;
    if ( ((pos= line.indexOf("<!DOCTYPE")) != -1)
         || ((pos= line.indexOf("<HTML")) != -1)
         || ((pos= line.indexOf("<html")) != -1) ) {

      // readLine() builds the line one char per byte, so ISO-8859-1 is
      // the encoding that turns those chars back into the original bytes.
      in.unread(line.substring(pos).getBytes("ISO-8859-1"));
      line.setLength(pos);

      // approximate bytes by chars- should be right for HTTP
      if (httpAccounting != null)
        httpAccounting.incrementBytesRead(pos);

      try {
        processHeaderLine(line, headers);
      } catch (Exception e) {
        // Best effort only: the text in front of the markup need not be
        // a valid header, so report it and carry on with the body.
        LOG.log(Level.WARNING,
                "Ignoring unparseable header before content: " + line, e);
      }

      return headers;
    }

    // approximate bytes by chars- should be right for HTTP
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());

    processHeaderLine(line, headers);
  }
  return headers;
}
}
/**
 * Set the timeout.
 *
 * @param timeout socket connect and read timeout, in milliseconds
 */
public void setTimeout(int timeout) {
  this.timeout = timeout;
}

/**
 * Set the point at which content is truncated.
 *
 * @param length maximum number of content bytes to keep
 */
public void setMaxContentLength(int length) {
  this.maxContentLength = length;
}

/**
 * Set the agent name.
 *
 * @param agentString value sent in the User-Agent request header
 */
public void setAgentString(String agentString) {
  this.agentString = agentString;
}

/**
 * Set the return email address.
 *
 * @param agentEmail contact address for this agent
 */
public void setAgentEmail(String agentEmail) {
  this.agentEmail = agentEmail;
}
/**
 * Makes a single HTTP request and returns its response, without following
 * redirects and without translating HTTP errors to exceptions.
 *
 * @param url            the URL to fetch
 * @param addr           address to connect to; if <code>null</code> the
 *                       URL's host is resolved instead
 * @param httpAccounting if not <code>null</code>, its fields are updated
 *                       during this request
 * @param httpVersion    the HTTP version to issue the request with
 * @return the response as received
 * @throws IOException   on I/O failure or if the URL is not an HTTP URL
 * @throws HttpException if the response cannot be parsed
 */
public Response getRawResponse(URL url, InetAddress addr,
                               MiscHttpAccounting httpAccounting,
                               int httpVersion)
  throws IOException, HttpException {
  Response raw = new Response(url, addr, httpAccounting, httpVersion);
  return raw;
}
/**
 * Returns the content of a URL, following redirects and translating HTTP
 * errors to exceptions.
 *
 * <p>Any 3xx response is treated as a redirect.  Its Location header is
 * resolved against the URL that produced it, so relative redirect targets
 * work as well as absolute ones.
 *
 * @param url the URL to fetch
 * @return the first response with status 200
 * @throws IOException if more than MAX_REDIRECTS redirects are needed, a
 *         redirect carries no Location header, the final status is not
 *         200, or on I/O failure
 * @throws HttpException if a response cannot be parsed
 */
public Response getResponse(URL url) throws IOException, HttpException {

  int redirects = 0;
  URL target = url;

  while (true) {
    Response response = new Response(target);   // make a request

    int code = response.getCode();

    if (code == CODE_OK) {                      // got a good response
      return response;                          // return it

    } else if (code >= 300 && code < 400) {     // handle redirect
      if (redirects == MAX_REDIRECTS)
        throw new IOException("Too many redirects: " + url);

      String location = response.getHeader("Location");
      if (location == null)
        throw new IOException("HTTP Error: " + code
                              + " (redirect without Location header)");

      // resolve against the current URL: Location may be relative
      target = new URL(target, location.trim());
      redirects++;
      LOG.fine("redirect to " + target);

    } else {                                    // convert to exception
      throw new IOException("HTTP Error: " + code);
    }
  }
}
/**
* Reads one line from the stream into <code>line</code>, replacing its
* previous contents.  A line ends at "\r\n", a lone "\r" or a lone "\n";
* the terminator is consumed but not stored.  Each byte read is appended
* as a single char.
*
* @param in stream to read from
* @param line buffer that receives the line
* @param allowContinuedLine if true, a non-empty line whose terminator is
*        followed by a space or tab is treated as continued: that one
*        character is dropped and the next physical line is appended to
*        the same buffer
* @return the length of the line read (0 for an empty line)
* @throws EOFException if the stream ends before a line terminator is
*         seen, even when some characters were already read
*/
private static int readLine(PushbackInputStream in, StringBuffer line,
boolean allowContinuedLine)
throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
case '\r':
// swallow the '\n' of a "\r\n" pair
if (peek(in) == '\n') {
in.read();
}
// no break: deliberately falls through to the end-of-line handling
case '\n':
if (line.length() > 0) {
// at EOL -- check for continued line if the current
// (possibly continued) line wasn't blank
if (allowContinuedLine)
switch (peek(in)) {
case ' ' : case '\t': // line is continued
in.read();
// this continue resumes the enclosing for loop
continue;
}
}
return line.length(); // else complete
default :
line.append((char)c);
}
}
throw new EOFException();
}
/**
 * Returns the next byte of the stream without consuming it.
 *
 * @param in stream to look ahead in
 * @return the next byte (0-255), or -1 at end of stream
 * @throws IOException on read failure
 */
private static int peek(PushbackInputStream in) throws IOException {
  int value = in.read();
  // unread(-1) would push the byte 0xFF into the stream and hide the end
  // of stream from the next read, so only push back real data.
  if (value != -1)
    in.unread(value);
  return value;
}
/**
 * For debugging: fetches one URL and prints its status code and content.
 *
 * <p>Usage: <code>Http [-verbose] [-timeout N] url</code>, where N is in
 * seconds.  Prints the usage line and exits with -1 if the arguments
 * are incomplete.
 */
public static void main(String[] args) throws Exception {
  int timeout = -1;
  boolean verbose = false;
  String urlString = null;

  String usage = "Usage: Http [-verbose] [-timeout N] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  for (int i = 0; i < args.length; i++) {     // parse command line
    if (args[i].equals("-timeout")) {         // found -timeout option
      if (i + 1 >= args.length) {             // option value is missing
        System.err.println(usage);
        System.exit(-1);
      }
      timeout = Integer.parseInt(args[++i]) * 1000;
    } else if (args[i].equals("-verbose")) {  // found -verbose option
      verbose = true;
    } else if (i != args.length-1) {
      System.err.println(usage);
      System.exit(-1);
    } else                                    // root is required parameter
      urlString = args[i];
  }

  if (urlString == null) {                    // only options were given
    System.err.println(usage);
    System.exit(-1);
  }

  Http http = new Http();

  if (timeout != -1)                          // set timeout
    http.setTimeout(timeout);

  // set log level
  if (verbose) {
    LOG.setLevel(Level.FINE);
  }

  Response response = http.getResponse(new URL(urlString));

  System.out.println("Code = " + response.getCode());
  System.out.println("Content:");
  String content = new String(response.getContent());
  System.out.println(content);
}
}
--- NEW FILE: URLFilterFactory.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import net.nutch.util.*;
import java.util.logging.*;
/** Factory to create a URLFilter from "urlfilter.class" config property. */
public class URLFilterFactory {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.URLFilterFactory");

  /** Name of the URLFilter implementation class, read from the configuration. */
  private static final String URLFILTER_CLASS =
    NutchConf.get("urlfilter.class");

  /** The shared filter instance, created on first use. */
  private static URLFilter filter;

  private URLFilterFactory() {}                   // no public ctor

  /**
   * Return the default URLFilter implementation.
   *
   * <p>The filter is created on the first call, through the configured
   * class's no-argument constructor, and the same instance is returned
   * afterwards.  The method is synchronized so that concurrent first
   * calls cannot each build an instance.
   *
   * @return the shared filter instance
   * @throws RuntimeException if the configured class cannot be loaded or
   *         instantiated
   */
  public static synchronized URLFilter getFilter() {
    if (filter == null) {
      try {
        LOG.info("Using URL filter: " + URLFILTER_CLASS);
        Class filterClass = Class.forName(URLFILTER_CLASS);
        filter = (URLFilter)filterClass.newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Couldn't create "+URLFILTER_CLASS, e);
      }
    }
    return filter;
  }
}
--- NEW FILE: UrlNormalizer.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net;
import java.net.URL;
import java.net.MalformedURLException;
// import java.net.URI;
// import java.net.URISyntaxException;
import java.util.logging.Logger;
import net.nutch.util.LogFormatter;
/** Converts URLs to a normal form. */
public class UrlNormalizer {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.UrlNormalizer");

  /**
   * Normalizes a URL string.
   *
   * <p>Surrounding whitespace is removed first; an empty result is returned
   * as is.  For "http" and "ftp" URLs the host is lowercased, a port equal
   * to the protocol's default is dropped, and an empty file part becomes
   * "/".  The string is rebuilt only if one of these changes (or a
   * protocol case change) applies; otherwise the trimmed input is
   * returned.
   *
   * <p>NOTE(review): a rebuilt URL is assembled from protocol, host, port
   * and file only, so other components (such as a "#fragment") appear to
   * be dropped in that case while an unchanged URL keeps them - confirm
   * this is intended.
   *
   * @param urlString the URL to normalize; must not be <code>null</code>
   * @return the normalized URL, or "" for empty or whitespace-only input
   * @throws MalformedURLException if the string is not a valid URL
   */
  public static String normalize(String urlString)
    throws MalformedURLException {

    urlString = urlString.trim();                 // remove extra spaces

    if ("".equals(urlString))                     // permit empty
      return urlString;

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    boolean changed = false;

    if (!urlString.startsWith(protocol))          // protocol was lowercased
      changed = true;

    if ("http".equals(protocol) || "ftp".equals(protocol)) {

      if (host != null) {
        // fixed locale: the default one may map letters unexpectedly
        String newHost = host.toLowerCase(java.util.Locale.ENGLISH);
        if (!host.equals(newHost)) {              // lowercase host
          host = newHost;
          changed = true;
        }
      }

      if (port == url.getDefaultPort()) {         // uses default port
        port = -1;                                // so don't specify it
        changed = true;
      }

      if (file == null || "".equals(file)) {      // add a slash
        file = "/";
        changed = true;
      }
    }

    if (changed)
      urlString = new URL(protocol, host, port, file).toString();

    return urlString;
  }
}
-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs