Update of /cvsroot/nutch/playground/src/java/net/nutch/net/protocols/http
In directory
sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/java/net/nutch/net/protocols/http
Added Files:
HttpException.java ContentLengthParseException.java
BadHeaderLineException.java Http.java Response.java
DecompressionException.java MiscHttpAccounting.java
ChunkLengthParseException.java HttpVersionException.java
BadStatusLineException.java ChunkEOFException.java
Log Message:
initial commit
--- NEW FILE: HttpException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
import net.nutch.net.protocols.ProtocolException;
/**
 * Base class for the non-IO failures raised while issuing an HTTP request
 * or while parsing the response to one.
 *
 * <p>Every constructor simply forwards its arguments to the matching
 * <code>ProtocolException</code> constructor.</p>
 */
public class HttpException extends ProtocolException {

  /**
   * Wraps an underlying cause without adding a message of its own.
   *
   * @param cause the failure that triggered this exception
   */
  public HttpException(Throwable cause) {
    super(cause);
  }

  /**
   * Creates an exception carrying both a description and its cause.
   *
   * @param message description of the failure
   * @param cause the failure that triggered this exception
   */
  public HttpException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Creates an exception carrying only a description.
   *
   * @param message description of the failure
   */
  public HttpException(String message) {
    super(message);
  }

  /** Creates an exception with neither a description nor a cause. */
  public HttpException() {
    super();
  }

}
--- NEW FILE: ContentLengthParseException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that the value of the <code>Content-Length</code> header
 * could not be parsed.
 */
public class ContentLengthParseException extends HttpException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing the unparsable value
   */
  ContentLengthParseException(String msg) {
    super(msg);
  }

}
--- NEW FILE: BadHeaderLineException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that a response header line could not be parsed.
 */
public class BadHeaderLineException extends HttpException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing the offending line
   */
  BadHeaderLineException(String msg) {
    super(msg);
  }

}
--- NEW FILE: Http.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
import java.io.EOFException;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.net.InetAddress;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;
/**
 * A simple HTTP client.
 *
 * <p>An instance holds the per-client settings (timeout, content limit,
 * agent string) that <code>Response</code> reads while performing a
 * request; the class also carries the HTTP version codes shared by the
 * package.</p>
 */
public class Http {
/** Logger shared by this package; registered under "net.nutch.net.Http". */
public static final Logger LOG =
LogFormatter.getLogger("net.nutch.net.Http");
/** Standard HTTP port. */
private static final int DEFAULT_PORT = 80;
/** Status code of a successful response. */
private static final int CODE_OK = 200;
/** Size, in bytes, of the I/O buffers allocated by <code>Response</code>. */
static final int BUFFER_SIZE = 16384;
/** Number of redirects <code>getResponse</code> follows before giving up. */
private static final int MAX_REDIRECTS = 5;
/** Reserved value for HTTP version number, does not denote any version */
public static final int HTTP_VER_NOTSET= -1;
/** HTTP version 1.0 (the earliest version we use) */
public static final int HTTP_VER_1_0= 0;
/** HTTP version 1.1 */
public static final int HTTP_VER_1_1= 1;
/**
 * The latest HTTP version this client will use. It is assigned in the
 * static initializer: 1.1 unless the "http.version.1.1" configuration
 * property is false, in which case it is 1.0.
 */
public static final int HTTP_VER_LATEST;
/**
 * Socket connect and read timeout, passed to the socket API as
 * milliseconds; defaults to the "http.timeout" property (10000).
 */
int timeout = NutchConf.getInt("http.timeout", 10000);
/**
 * Point at which response content is truncated, in bytes; defaults to
 * the "http.content.limit" property (64 KB).
 */
int maxContentLength= NutchConf.getInt("http.content.limit",64*1024);
/** Value sent as the User-Agent header; unset until setAgentString is called. */
String agentString= null;
/** Contact address taken from the "http.agent.email" property. */
private String agentEmail = NutchConf.get("http.agent.email");
// Pick the newest protocol version once, when the class is loaded.
static {
if (NutchConf.getBoolean("http.version.1.1", true))
HTTP_VER_LATEST= HTTP_VER_1_1;
else
HTTP_VER_LATEST= HTTP_VER_1_0;
}
/**
 * Picks the lower of two HTTP version codes, treating
 * <code>HTTP_VER_NOTSET</code> as "no opinion": when exactly one
 * argument is unset the other one is returned, and when both are unset
 * the result is <code>HTTP_VER_NOTSET</code>.
 *
 * @param ver1 first version code
 * @param ver2 second version code
 * @return the lesser version code that is actually set
 */
public static int minHttpVersion(int ver1, int ver2) {
  // An unset side never wins; hand back whatever the other side holds.
  if (ver1 == HTTP_VER_NOTSET) {
    return ver2;
  }
  if (ver2 == HTTP_VER_NOTSET) {
    return ver1;
  }
  return Math.min(ver1, ver2);
}
/**
 * Sets the timeout applied to connecting to and reading from the server.
 *
 * @param timeout the new timeout; handed to the socket API as milliseconds
 */
public void setTimeout(int timeout) {
  this.timeout = timeout;
}

/**
 * Sets the point at which downloaded content is truncated.
 *
 * @param length the new content limit, in bytes
 */
public void setMaxContentLength(int length) {
  this.maxContentLength = length;
}

/**
 * Sets the agent name sent with each request.
 *
 * @param agentString the value for the User-Agent header
 */
public void setAgentString(String agentString) {
  this.agentString = agentString;
}

/**
 * Sets the return e-mail address of the agent.
 *
 * @param agentEmail the contact address
 */
public void setAgentEmail(String agentEmail) {
  this.agentEmail = agentEmail;
}
/**
 * Performs exactly one HTTP request and hands back whatever the server
 * answered: redirects are not followed and HTTP error codes are not
 * turned into exceptions.
 *
 * @param url the URL to fetch
 * @param addr the address to connect to; when <code>null</code> the
 *        host of <code>url</code> is looked up instead
 * @param httpAccounting when not <code>null</code>, its fields are
 *        updated during this request
 * @param httpVersion the HTTP version code to issue the request with
 * @return the raw response
 * @throws IOException on network failure
 * @throws HttpException when the response cannot be parsed
 */
public Response getRawResponse(URL url, InetAddress addr,
                               MiscHttpAccounting httpAccounting,
                               int httpVersion)
  throws IOException, HttpException {
  final Response rawResponse =
    new Response(this, url, addr, httpAccounting, httpVersion);
  return rawResponse;
}
/**
 * Returns the content of a URL, following redirects and translating
 * HTTP errors to exceptions.
 *
 * <p>A 3xx response is followed through its <code>Location</code>
 * header, which may be absolute or relative to the URL that produced
 * it. At most <code>MAX_REDIRECTS</code> redirects are followed.</p>
 *
 * @param url the URL to fetch
 * @return the first response with status 200
 * @throws IOException on network failure, on any status other than
 *         200 or 3xx, on a redirect without a usable
 *         <code>Location</code> header, or after too many redirects
 * @throws HttpException when a response cannot be parsed
 */
public Response getResponse(URL url) throws IOException, HttpException {
  int redirects = 0;
  URL target = url;

  while (true) {
    Response response = new Response(this, target);   // make a request
    int code = response.getCode();

    if (code == CODE_OK) {                            // got a good response
      return response;
    }

    if (code < 300 || code >= 400) {                  // convert to exception
      throw new IOException("HTTP Error: " + code);
    }

    // handle redirect
    if (redirects == MAX_REDIRECTS)
      throw new IOException("Too many redirects: " + url);

    String location = response.getHeader("Location");
    if (location == null || location.trim().length() == 0)
      throw new IOException("HTTP Error: " + code
                            + " without a Location header from " + target);

    // Resolve against the URL just fetched so that relative redirects
    // ("/new/path", "other.html") work; an absolute Location simply
    // replaces the context.
    target = new URL(target, location.trim());
    redirects++;
    LOG.fine("redirect to " + target);
  }
}
/**
 * Reads one line from <code>in</code> into <code>line</code>, replacing
 * its previous contents. A line ends at CR LF, a bare CR or a bare LF;
 * the terminator is consumed but not stored. Each byte is widened to a
 * char as-is.
 *
 * <p>When <code>allowContinuedLine</code> is set and a non-empty line is
 * followed by a space or tab, that one character is dropped and the
 * following text is appended to the same line (header folding).</p>
 *
 * @param in stream to read from
 * @param line buffer that receives the line
 * @param allowContinuedLine whether folded lines should be joined
 * @return the length of the line read
 * @throws EOFException if the stream ends before a line terminator
 * @throws IOException on any other read failure
 */
static int readLine(PushbackInputStream in, StringBuffer line,
                    boolean allowContinuedLine)
  throws IOException {
  line.setLength(0);
  for (int c = in.read(); c != -1; c = in.read()) {
    switch (c) {

      case '\r':
        if (peek(in) == '\n') {
          in.read();
        }
        // deliberate fall-through: CR (with or without LF) ends the line

      case '\n':
        if (line.length() > 0) {
          // at EOL -- check for continued line if the current
          // (possibly continued) line wasn't blank
          if (allowContinuedLine)
            switch (peek(in)) {
              case ' ' : case '\t':                   // line is continued
                in.read();
                continue;                             // resumes the for loop
            }
        }
        return line.length();                         // else complete

      default :
        line.append((char)c);
    }
  }
  throw new EOFException();
}

/**
 * Returns the next byte of <code>in</code> without consuming it, or -1
 * at end of stream.
 */
private static int peek(PushbackInputStream in) throws IOException {
  int value = in.read();
  // Only push back real data: unread(-1) would store the byte 0xFF and
  // make the stream appear to contain an extra character at its end.
  if (value != -1)
    in.unread(value);
  return value;
}
/**
 * Command-line entry point for debugging: fetches one URL and prints
 * its status code and content.
 *
 * <p>Usage: <code>Http [-verbose] [-timeout N] url</code>, where N is
 * given in seconds. Malformed command lines print the usage string and
 * exit with status -1.</p>
 */
public static void main(String[] args) throws Exception {
  int timeout = -1;
  boolean verbose = false;
  String urlString = null;

  String usage = "Usage: Http [-verbose] [-timeout N] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  for (int i = 0; i < args.length; i++) {           // parse command line
    if (args[i].equals("-timeout")) {               // found -timeout option
      if (i + 1 >= args.length) {                   // option value is missing
        System.err.println(usage);
        System.exit(-1);
      }
      try {
        timeout = Integer.parseInt(args[++i]) * 1000;
      } catch (NumberFormatException e) {           // value is not a number
        System.err.println(usage);
        System.exit(-1);
      }
    } else if (args[i].equals("-verbose")) {        // found -verbose option
      verbose = true;
    } else if (i != args.length-1) {
      System.err.println(usage);
      System.exit(-1);
    } else                                          // url is required parameter
      urlString = args[i];
  }

  if (urlString == null) {                          // only options were given
    System.err.println(usage);
    System.exit(-1);
  }

  Http http = new Http();

  if (timeout != -1)                                // set timeout
    http.setTimeout(timeout);

  // set log level
  if (verbose) {
    LOG.setLevel(Level.FINE);
  }

  Response response = http.getResponse(new URL(urlString));

  System.out.println("Code = " + response.getCode());
  System.out.println("Content:");
  // debugging output only, so the platform charset is good enough
  String content = new String(response.getContent());
  System.out.println(content);
}
}
--- NEW FILE: Response.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import net.nutch.fetcher.FetcherConstants;
import net.nutch.fetcher.FetcherStatus;
import net.nutch.util.GZIPUtils;
/**
 * An HTTP response.
 *
 * <p>Constructing an instance performs the request: the constructor
 * opens the socket, sends a GET, and parses the status line, headers
 * and body into the fields below.</p>
 */
public class Response {
/** The client whose settings (timeout, agent string, limits) are used. */
private final Http http;
/** Status code of the last status line parsed. */
private int code;
/** Number of "100 Continue" responses skipped before the final one. */
private int numContinues;
/** Headers of the final response; built as a case-insensitive map. */
private Map headers;
/** Response body, after decompression when the server sent gzip. */
private byte[] content;
/** Body exactly as received when it was gzip-encoded; otherwise unset. */
private byte[] compressedContent;
/** Optional byte/version accounting sink; may be null. */
MiscHttpAccounting httpAccounting;
/**
 * Returns the response code.
 *
 * @return the status code parsed from the status line
 */
public int getCode() {
  return this.code;
}

/**
 * Returns the value of a named header.
 *
 * @param name the header name to look up
 * @return the header value, or <code>null</code> if there is no such
 *         header
 */
public String getHeader(String name) {
  final Object value = this.headers.get(name);
  return (String) value;
}

/**
 * Returns the full content of the response.
 *
 * @return the body; when the server sent it gzip-encoded this is the
 *         decompressed form
 */
public byte[] getContent() {
  return this.content;
}

/**
 * Returns the compressed version of the content if the server
 * transmitted a compressed version, or <code>null</code>
 * otherwise.
 *
 * @return the body as received, or <code>null</code>
 */
public byte[] getCompressedContent() {
  return this.compressedContent;
}

/**
 * Returns the number of 100/Continue headers encountered.
 *
 * @return how many interim "100" responses preceded the final one
 */
public int getNumContinues() {
  return this.numContinues;
}
/**
 * Fetches <code>url</code> using the latest HTTP version the client
 * supports, looking up the host address itself and keeping no
 * accounting.
 *
 * @param http the client whose settings are used
 * @param url the URL to fetch; must use the "http" protocol
 * @throws IOException on network failure or a non-HTTP URL
 * @throws HttpException when the response cannot be parsed
 */
Response(Http http, URL url)
  throws IOException, HttpException {
  this(http, url, null, null, Http.HTTP_VER_LATEST);
}

/**
 * Issues a single GET request for <code>url</code> and reads the whole
 * response: status line, headers and body. Interim "100 Continue"
 * responses are skipped and counted. A chunked body is reassembled and
 * a gzip-encoded body is decompressed, with the compressed bytes kept
 * separately. The socket is always closed before the constructor
 * returns or throws.
 *
 * @param http the client whose timeout, agent string and content limit
 *        are used
 * @param url the URL to fetch; must use the "http" protocol
 * @param addr address to connect to, or <code>null</code> to look up
 *        the host of <code>url</code>
 * @param httpAccounting optional accounting sink, may be
 *        <code>null</code>
 * @param httpVersion version code to issue the request with; values
 *        outside 0..<code>Http.HTTP_VER_LATEST</code> are replaced by
 *        <code>Http.HTTP_VER_LATEST</code>
 * @throws IOException on network failure or a non-HTTP URL
 * @throws HttpException when the response cannot be parsed or
 *         decompressed
 */
Response(Http http, URL url, InetAddress addr,
         MiscHttpAccounting httpAccounting,
         int httpVersion)
  throws IOException, HttpException {

  this.httpAccounting= httpAccounting;
  this.http = http;

  if (!"http".equals(url.getProtocol()))
    throw new IOException("Not an HTTP url:" + url);

  if ( (httpVersion < 0) || (httpVersion > Http.HTTP_VER_LATEST) )
    httpVersion= Http.HTTP_VER_LATEST;

  if (Http.LOG.isLoggable(Level.FINE))
    Http.LOG.fine("fetching " + url);

  // some servers will redirect a request with a host line like
  // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
  // don't want the :80...
  int port;
  String portString;
  if (url.getPort() == -1) {
    port= 80;
    portString= "";
  } else {
    port= url.getPort();
    portString= ":" + port;
  }

  Socket socket = null;

  try {
    socket = new Socket();                          // create the socket
    socket.setSoTimeout(this.http.timeout);

    if (addr == null) {
      addr= InetAddress.getByName(url.getHost());
      if (httpAccounting != null)
        httpAccounting.setAddr(addr);
    }

    // connect
    InetSocketAddress sockAddr= new InetSocketAddress(addr, port);
    socket.connect(sockAddr, this.http.timeout);

    OutputStream req = socket.getOutputStream();    // make request
    byte[] reqBytes= buildRequest(url, portString, httpVersion);

    if (httpAccounting != null)
      httpAccounting.incrementBytesSent(reqBytes.length);

    req.write(reqBytes);
    req.flush();

    PushbackInputStream in =                        // process response
      new PushbackInputStream(
        new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
        Http.BUFFER_SIZE) ;

    StringBuffer line = new StringBuffer();

    // Skip interim "100 Continue" responses; numContinues ends up as
    // the number of them that preceded the final status line.
    numContinues= -1;
    boolean haveSeenNonContinueStatus= false;
    while (!haveSeenNonContinueStatus) {
      numContinues++;
      // parse status code line
      this.code = parseStatusLine(in, line);
      // parse headers
      this.headers = parseHeaders(in, line);
      haveSeenNonContinueStatus= code != 100;       // 100 is "Continue"
    }

    // Coding names are case-insensitive tokens and may carry stray
    // whitespace, so compare them leniently.
    if (isToken(getHeader("Transfer-Encoding"), "chunked")) {
      Http.LOG.fine("fetching chunked!");
      try {
        readChunkedContent(in, line);
      } catch (EOFException e) {
        throw new ChunkEOFException("stream ended inside chunked content of "
                                    + url + ": " + e);
      }
    } else {
      Http.LOG.fine("fetching plain!");
      readPlainContent(in);
    }

    String contentEncoding= getHeader("Content-Encoding");
    if (isToken(contentEncoding, "gzip")
        || isToken(contentEncoding, "x-gzip")) {
      Http.LOG.fine("uncompressing....");
      compressedContent= content;

      FetcherStatus.logTraceMisc(FetcherConstants.MISC_INFORMATIONAL,
                                 "about to decompress: " + url);

      content= GZIPUtils.unzipBestEffort(compressedContent,
                                         this.http.maxContentLength);
      if (content == null)
        throw new DecompressionException("unzipBestEffort returned null");

      if (Http.LOG.isLoggable(Level.FINE))
        Http.LOG.fine("fetched " + compressedContent.length
                      + " bytes of compressed content (expanded to "
                      + content.length + " bytes) from " + url);
    } else {
      if (Http.LOG.isLoggable(Level.FINE))
        Http.LOG.fine("fetched " + content.length + " bytes from " + url);
    }

  } finally {
    if (socket != null)
      socket.close();
  }
}

/**
 * Builds the bytes of the GET request: request line, Host header, the
 * HTTP/1.1-only headers, and User-Agent when one is configured.
 */
private byte[] buildRequest(URL url, String portString, int httpVersion)
  throws IOException {
  String path = "".equals(url.getFile()) ? "/" : url.getFile();

  StringBuffer reqStr = new StringBuffer("GET ");
  reqStr.append(path);
  if (httpVersion == Http.HTTP_VER_1_1)
    reqStr.append(" HTTP/1.1\r\n");
  else
    reqStr.append(" HTTP/1.0\r\n");

  reqStr.append("Host: ");
  reqStr.append(url.getHost());
  reqStr.append(portString);
  reqStr.append("\r\n");

  if (httpVersion == Http.HTTP_VER_1_1) {
    reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
    reqStr.append("Connection: close\r\n");
  }

  if ((this.http.agentString == null)
      || (this.http.agentString.length() == 0)) {
    Http.LOG.severe("User-agent is not set!");
  } else {
    reqStr.append("User-Agent: ");
    reqStr.append(this.http.agentString);
    reqStr.append("\r\n");
  }

  reqStr.append("\r\n");

  // Fixed charset: the bytes on the wire must not depend on the
  // platform default encoding.
  return reqStr.toString().getBytes("ISO-8859-1");
}

/**
 * Tells whether a header value, ignoring case and surrounding
 * whitespace, is exactly <code>token</code>.
 */
private static boolean isToken(String headerValue, String token) {
  return headerValue != null
    && token.equalsIgnoreCase(headerValue.trim());
}
/**
 * Reads a non-chunked body into <code>content</code>.
 *
 * <p>Reading stops at end of stream, after <code>Content-Length</code>
 * bytes when that header is present, or at the client's content limit,
 * whichever comes first. Never more than that many bytes are read or
 * stored.</p>
 *
 * @param in stream positioned at the first body byte
 * @throws ContentLengthParseException if <code>Content-Length</code>
 *         is not a number
 * @throws IOException on read failure
 */
private void readPlainContent(InputStream in)
  throws HttpException, IOException {

  // Parsed as a long so that lengths above Integer.MAX_VALUE are
  // accepted (and then clamped) instead of rejected as malformed.
  long contentLength = Long.MAX_VALUE;              // get content length
  String contentLengthString = (String)headers.get("Content-Length");
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      contentLength = Long.parseLong(contentLengthString);
    } catch (NumberFormatException e) {
      throw new ContentLengthParseException(contentLengthString);
    }
    // A negative length is meaningless; treat it like a missing header
    // and read until end of stream or the limit.
    if (contentLength < 0)
      contentLength = Long.MAX_VALUE;
  }

  int limit = this.http.maxContentLength;           // limit download size
  if (contentLength < limit)
    limit = (int)contentLength;

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;                                   // read content

  // Checking before each read means a zero-length body never blocks on
  // the socket, and capping each read keeps the result within the limit.
  while (length < limit) {
    int toRead = Math.min(bytes.length, limit - length);
    int count = in.read(bytes, 0, toRead);
    if (count == -1)
      break;

    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(count);

    out.write(bytes, 0, count);
    length += count;
  }

  this.content = out.toByteArray();
}
/**
 * Reads a body sent with the chunked transfer coding into
 * <code>content</code>.
 *
 * <p>Each chunk is a hexadecimal size line (optionally followed by
 * ";extension"), that many bytes of data, and a line terminator. A
 * size of zero ends the body and is followed by optional trailer
 * lines, which are read and discarded. Once the client's content limit
 * is reached the body is cut off there and the rest of the stream is
 * left unread.</p>
 *
 * @param in stream positioned at the first chunk-size line
 * @param line scratch buffer for reading lines
 * @throws ChunkLengthParseException if a chunk size is not a
 *         non-negative hexadecimal number
 * @throws ChunkEOFException if the stream ends inside a chunk's data
 * @throws IOException on read failure, including
 *         <code>EOFException</code> when the stream ends inside a line
 */
private void readChunkedContent(PushbackInputStream in,
                                StringBuffer line)
  throws HttpException, IOException {
  int maxLength= this.http.maxContentLength;
  int contentBytesRead= 0;                          // bytes in finished chunks
  boolean truncated= false;
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);

  while (true) {
    Http.LOG.fine("Http: starting chunk");

    Http.readLine(in, line, false);
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());

    // the size is everything before an optional ";chunk-extension"
    String chunkLenStr;
    int pos= line.indexOf(";");
    if (pos < 0) {
      chunkLenStr= line.toString();
    } else {
      chunkLenStr= line.substring(0, pos);
    }
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e){
      throw new ChunkLengthParseException(line.toString());
    }
    if (chunkLen < 0)                               // e.g. "-5" parses as hex
      throw new ChunkLengthParseException(line.toString());

    if (chunkLen == 0)                              // last-chunk marker
      break;

    // Clamp to the room left under the content limit. Written as a
    // subtraction so that a huge chunk size cannot overflow the sum.
    if (chunkLen > maxLength - contentBytesRead) {
      chunkLen= maxLength - contentBytesRead;
      truncated= true;
    }

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {

      int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
        (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE;
      int len= in.read(bytes, 0, toRead);

      if (len == -1)
        throw new ChunkEOFException("after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead
                                    + " in current chunk");

      // Do not log the data itself: it may be GZIPed binary.

      if (httpAccounting != null)
        httpAccounting.incrementBytesRead(len);

      out.write(bytes, 0, len);
      chunkBytesRead+= len;
    }
    contentBytesRead+= chunkBytesRead;

    // What follows a cut-off chunk is the rest of its data, not a
    // line terminator, so stop parsing here.
    if (truncated)
      break;

    // consume the line terminator that closes the chunk data
    Http.readLine(in, line, false);
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());
  }

  this.content= out.toByteArray();

  // trailer lines only exist when the body was read to its end
  if (!truncated)
    parseHeaders(in, line);
}
/**
 * Reads the status line of a response and returns its status code.
 * When accounting is enabled, the server's HTTP version is recorded as
 * well; a missing, malformed or non-1.x version is recorded as
 * HTTP/1.0.
 *
 * @param in stream positioned at the start of the status line
 * @param line scratch buffer; holds the status line on return
 * @return the numeric status code
 * @throws BadStatusLineException if the status code is not a number
 * @throws IOException on read failure
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  Http.readLine(in, line, false);

  // approximate bytes by chars- should be right for HTTP
  if (httpAccounting != null)
    httpAccounting.incrementBytesRead(line.length());

  int codeStart = line.indexOf(" ");
  int codeEnd = line.indexOf(" ", codeStart+1);

  // handle lines with no plaintext result code, ie:
  // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
  if (codeEnd == -1)
    codeEnd= line.length();

  int code;
  try {
    code= Integer.parseInt(line.substring(codeStart+1, codeEnd));
  } catch (NumberFormatException e) {
    throw new BadStatusLineException("bad status line '" + line
                                     + "': " + e.getMessage(), e);
  }

  if (httpAccounting != null) {
    httpAccounting.setServHttpVersion(parseServerVersion(line, codeStart));
  }

  return code;
}

/**
 * Extracts the server's HTTP version from a status line of the form
 * "HTTP/major.minor code ...". Anything that does not match, or whose
 * major version is not 1, is reported as HTTP/1.0.
 *
 * @param line the status line
 * @param codeStart index of the first space in <code>line</code>, or -1
 * @return <code>Http.HTTP_VER_1_0</code> or <code>Http.HTTP_VER_1_1</code>
 */
private static int parseServerVersion(StringBuffer line, int codeStart) {
  String status= line.toString();
  if (!status.startsWith("HTTP/"))
    return Http.HTTP_VER_1_0;

  // the dot must sit between "HTTP/" and the status code; checking
  // first keeps substring() from throwing on lines like "HTTP/1 200"
  int dotPos= status.indexOf('.');
  if (dotPos < 5 || dotPos >= codeStart)
    return Http.HTTP_VER_1_0;

  try {
    int httpMajorVer= Integer.parseInt(status.substring(5, dotPos));
    int httpMinorVer= Integer.parseInt(status.substring(dotPos+1, codeStart));
    if (httpMajorVer == 1)
      return (httpMinorVer < 1) ? Http.HTTP_VER_1_0 : Http.HTTP_VER_1_1;
  } catch (NumberFormatException ignored) {
    // bogus version numbers fall through to the conservative default
  }
  return Http.HTTP_VER_1_0;
}
/**
 * Splits one header line at its first colon and stores the pair in
 * <code>headers</code>. Spaces and tabs directly after the colon are
 * not part of the value. A line made up only of whitespace (or nothing
 * at all) is silently ignored.
 *
 * @param line the header line, without its line terminator
 * @param headers map receiving the name/value pair
 * @throws BadHeaderLineException if a non-blank line has no colon
 */
private void processHeaderLine(StringBuffer line, TreeMap headers)
  throws IOException, HttpException {

  final int length = line.length();
  final int colon = line.indexOf(":");              // key is up to colon

  if (colon == -1) {
    boolean blank = true;
    for (int i = 0; blank && i < length; i++) {
      blank = Character.isWhitespace(line.charAt(i));
    }
    if (blank) {
      return;
    }
    throw new BadHeaderLineException("No colon in header:" + line);
  }

  int valueStart = colon + 1;                       // skip whitespace
  while (valueStart < length
         && (line.charAt(valueStart) == ' '
             || line.charAt(valueStart) == '\t')) {
    valueStart++;
  }

  headers.put(line.substring(0, colon), line.substring(valueStart));
}
/**
 * Reads header lines up to the blank line that ends them and returns
 * them in a new map whose keys compare case-insensitively.
 *
 * @param in stream positioned at the first header line
 * @param line scratch buffer for reading lines
 * @return the headers read
 * @throws BadHeaderLineException if a header line has no colon
 * @throws IOException on read failure
 */
private Map parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER);
  return parseHeaders(in, line, headers);
}

/**
 * Reads header lines up to the blank line that ends them, adding them
 * to an existing map.
 *
 * <p>Some servers omit the blank line. If a line contains the start of
 * an HTML document ("&lt;!DOCTYPE", "&lt;HTML" or "&lt;html"), the text
 * from that point on is pushed back onto the stream as body data, what
 * precedes it is treated as the last header, and parsing stops.</p>
 *
 * @param in stream positioned at the first header line
 * @param line scratch buffer for reading lines
 * @param headers map to add the headers to
 * @return <code>headers</code>
 * @throws BadHeaderLineException if a header line has no colon
 * @throws IOException on read failure
 */
private Map parseHeaders(PushbackInputStream in, StringBuffer line,
                         TreeMap headers)
  throws IOException, HttpException {
  while (Http.readLine(in, line, true) != 0) {

    // handle HTTP responses with missing blank line after headers
    int pos;
    if ( ((pos= line.indexOf("<!DOCTYPE")) != -1)
         || ((pos= line.indexOf("<HTML")) != -1)
         || ((pos= line.indexOf("<html")) != -1) ) {

      // readLine widened every byte to a char unchanged, so ISO-8859-1
      // is the encoding that turns the text back into the same bytes.
      in.unread(line.substring(pos).getBytes("ISO-8859-1"));
      line.setLength(pos);

      // approximate bytes by chars- should be right for HTTP
      if (httpAccounting != null)
        httpAccounting.incrementBytesRead(pos);

      try {
        processHeaderLine(line, headers);
      } catch (HttpException e) {
        // best effort: the fragment ahead of the document start is
        // often junk, so note it and keep the headers read so far
        Http.LOG.log(Level.WARNING,
                     "ignoring malformed header text '" + line + "'", e);
      }

      return headers;
    }

    // approximate bytes by chars- should be right for HTTP
    if (httpAccounting != null)
      httpAccounting.incrementBytesRead(line.length());

    processHeaderLine(line, headers);
  }
  return headers;
}
}
--- NEW FILE: DecompressionException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that some data was received but could not be decompressed.
 */
public class DecompressionException extends HttpVersionException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing the failure
   */
  DecompressionException(String msg) {
    super(msg);
  }

}
--- NEW FILE: MiscHttpAccounting.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
import java.net.InetAddress;
/**
 * Plain holder for the raw byte counts of a request (sent and
 * received) and for the server's HTTP version, which callers may use
 * for accounting. It also carries an <code>InetAddress</code> so that
 * a caller can cache the address.
 */
public class MiscHttpAccounting {

  private long bytesRead;
  private long bytesSent;
  private int httpVersion;
  private InetAddress addr;

  /** Creates a holder with zero counts, no version and no address. */
  public MiscHttpAccounting() {
    this.bytesRead = 0;
    this.bytesSent = 0;
    this.httpVersion = Http.HTTP_VER_NOTSET;
    this.addr = null;
  }

  /** Returns the number of bytes counted as read so far. */
  public long getBytesRead() {
    return this.bytesRead;
  }

  /** Adds <code>incr</code> to the count of bytes read. */
  public void incrementBytesRead(long incr) {
    this.bytesRead = this.bytesRead + incr;
  }

  /** Returns the number of bytes counted as sent so far. */
  public long getBytesSent() {
    return this.bytesSent;
  }

  /** Adds <code>incr</code> to the count of bytes sent. */
  public void incrementBytesSent(long incr) {
    this.bytesSent = this.bytesSent + incr;
  }

  /** Returns the recorded server HTTP version code. */
  public int getServHttpVersion() {
    return this.httpVersion;
  }

  /** Records the server HTTP version code. */
  public void setServHttpVersion(int httpVersion) {
    this.httpVersion = httpVersion;
  }

  /** Returns the stored address, or <code>null</code> if none is set. */
  public InetAddress getAddr() {
    return this.addr;
  }

  /** Stores an address. */
  public void setAddr(InetAddress addr) {
    this.addr = addr;
  }

  /** Puts every field back to its freshly-constructed value. */
  public void reset() {
    this.addr = null;
    this.httpVersion = Http.HTTP_VER_NOTSET;
    this.bytesSent = 0;
    this.bytesRead = 0;
  }

}
--- NEW FILE: ChunkLengthParseException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that a chunk length could not be parsed as a hexadecimal
 * integer.
 */
public class ChunkLengthParseException extends HttpVersionException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing the unparsable length
   */
  ChunkLengthParseException(String msg) {
    super(msg);
  }

}
--- NEW FILE: HttpVersionException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Base class for failures that are specific to an HTTP version. When
 * one of these is caught, later attempts to fetch the same page should
 * consider falling back to an earlier protocol version.
 */
public abstract class HttpVersionException extends HttpException {

  /**
   * @param msg detail message describing the failure
   * @param cause the failure that triggered this exception
   */
  HttpVersionException(String msg, Throwable cause) {
    super(msg, cause);
  }

  /**
   * @param msg detail message describing the failure
   */
  HttpVersionException(String msg) {
    super(msg);
  }

}
--- NEW FILE: BadStatusLineException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that the status line of a response could not be parsed.
 */
public class BadStatusLineException extends HttpException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing the offending line
   * @param cause the parse failure behind this exception
   */
  BadStatusLineException(String msg, Throwable cause) {
    super(msg, cause);
  }

}
--- NEW FILE: ChunkEOFException.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.net.protocols.http;
/**
 * Signals that the server closed the socket in the middle of a chunk.
 */
public class ChunkEOFException extends HttpVersionException {

  /**
   * Package-private: only the HTTP client code creates these.
   *
   * @param msg detail message describing where the stream ended
   */
  ChunkEOFException(String msg) {
    super(msg);
  }

}
-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs