net RegexURLFilter.java,NONE,1.1 URLFilter.java,NONE,1.1 Http.java,NONE,1.1 URLFilterFactory.java,NONE,1.1 UrlNormalizer.java,NONE,1.1

joa23 Thu, 29 Jan 2004 07:58:53 -0800

Update of /cvsroot/nutch/playground/src/java/net/nutch/net
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/java/net/nutch/net


Added Files:
        RegexURLFilter.java URLFilter.java Http.java 
        URLFilterFactory.java UrlNormalizer.java 
Log Message:
initial commit

--- NEW FILE: RegexURLFilter.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;

import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Pattern;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.MalformedPatternException;

import net.nutch.util.*;

/** Filters URLs based on a file of regular expressions.  The config file is
 * named by the Nutch configuration property "urlfilter.regex.file".
 *
 * <p>The format of this file is:
 * <pre>
 * [+-]&lt;regex&gt;
 * </pre>
 * where plus means go ahead and index it and minus means no.  Empty lines
 * and lines starting with a space or <code>#</code> are ignored; any other
 * first character is an error.
 *
 * <p>Rules are tried in file order and the first rule whose pattern is
 * contained in the URL decides.  A URL matched by no rule is rejected.
 */

public class RegexURLFilter implements URLFilter {

  /** One configuration line: the compiled pattern, its sign and its source. */
  private static class Rule {
    public Perl5Pattern pattern;
    public boolean sign;         // true for '+', false for '-'
    public String regex;         // original text, kept for debugging
  }

  private List rules;
  private PatternMatcher matcher = new Perl5Matcher();

  /**
   * Creates a filter from the resource named by the
   * "urlfilter.regex.file" configuration property.
   *
   * @throws IOException if the resource cannot be opened or read, or if it
   *         contains a line with an invalid first character
   * @throws MalformedPatternException if a regular expression does not compile
   */
  public RegexURLFilter() throws IOException, MalformedPatternException {
    String file = NutchConf.get("urlfilter.regex.file");
    Reader reader = NutchConf.getConfResourceAsReader(file);
    // NOTE(review): guards against a null reader in case the resource lookup
    // reports "not found" that way; previously that surfaced as a bare NPE.
    if (reader == null)
      throw new IOException("Cannot open URL filter configuration: " + file);
    rules = readConfigurationFile(reader);
  }

  /**
   * Creates a filter from the named file.
   *
   * @param filename path of the rules file
   * @throws IOException if the file cannot be opened or read, or if it
   *         contains a line with an invalid first character
   * @throws MalformedPatternException if a regular expression does not compile
   */
  public RegexURLFilter(String filename)
    throws IOException, MalformedPatternException {
    rules = readConfigurationFile(new FileReader(filename));
  }

  /**
   * Applies the rules to a URL.  Synchronized because every call shares the
   * single matcher instance held by this filter.
   *
   * @param url the URL to test
   * @return <code>url</code> if the first matching rule is a '+' rule;
   *         <code>null</code> if it is a '-' rule or if no rule matches
   */
  public synchronized String filter(String url) {
    Iterator i = rules.iterator();
    while (i.hasNext()) {
      Rule r = (Rule) i.next();
      if (matcher.contains(url, r.pattern)) {
        //System.out.println("Matched " + r.regex);
        return r.sign ? url : null;
      }
    }

    return null;   // assume no go
  }

  //
  // Format of configuration file is
  //
  // [+-]<regex>
  //
  // where plus means go ahead and index it and minus means no.
  //

  /**
   * Reads and compiles all rules from <code>reader</code>.  The reader is
   * always closed before this method returns, whether or not parsing
   * succeeded.
   *
   * @param reader source of the rule lines
   * @return the rules as a list of Rule objects, in file order
   * @throws IOException on a read error or an invalid first character
   * @throws MalformedPatternException if a regular expression does not compile
   */
  private static List readConfigurationFile(Reader reader)
    throws IOException, MalformedPatternException {

    BufferedReader in = new BufferedReader(reader);
    try {
      Perl5Compiler compiler = new Perl5Compiler();
      List rules = new ArrayList();
      String line;

      while ((line = in.readLine()) != null) {
        if (line.length() == 0)
          continue;
        char first = line.charAt(0);
        boolean sign = false;
        switch (first) {
        case '+' :
          sign = true;
          break;
        case '-' :
          sign = false;
          break;
        case ' ' : case '\n' : case '#' :         // skip blank & comment lines
          continue;
        default :
          throw new IOException("Invalid first character: " + line);
        }

        String regex = line.substring(1);

        Rule rule = new Rule();
        rule.pattern = (Perl5Pattern) compiler.compile(regex);
        rule.sign = sign;
        rule.regex = regex;
        rules.add(rule);
      }

      return rules;
    } finally {
      in.close();                                 // also closes the wrapped reader
    }
  }

  /**
   * Command-line check: reads URLs from standard input, one per line, and
   * echoes each one prefixed with '+' (accepted) or '-' (rejected).
   */
  public static void main(String args[])
    throws IOException, MalformedPatternException {

    RegexURLFilter filter = new RegexURLFilter();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null) {
      String out = filter.filter(line);
      if (out != null) {
        System.out.print("+");
        System.out.println(out);
      } else {
        System.out.print("-");
        System.out.println(line);
      }
    }
  }

}

--- NEW FILE: URLFilter.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

/** Interface used to limit which URLs enter Nutch.  Used by the injector and
 * the db updater.*/

public interface URLFilter {

  /**
   * Interface for a filter that transforms a URL: it can pass the
   * original URL through or "delete" the URL by returning null.
   *
   * @param url the URL to examine
   * @return the URL to use, or <code>null</code> if the URL is rejected
   */
  public String filter(String url);

}

--- NEW FILE: Http.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

import java.io.IOException;
import java.io.EOFException;
import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.io.InputStream;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;

import java.net.URL;
import java.net.Socket;
import java.net.InetAddress;
import java.net.InetSocketAddress;

import java.util.Map;
import java.util.TreeMap;

import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;

import net.nutch.util.LogFormatter;
import net.nutch.util.GZIPUtils;
import net.nutch.util.NutchConf;
import net.nutch.fetcher.FetcherStatus;
import net.nutch.fetcher.FetcherConstants;

/** A simple HTTP client. */
public class Http {

  /** Logger shared by this class and its nested classes. */
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.Http");

  // NOTE(review): DEFAULT_PORT and CODE_OK are declared here, but the code
  // below uses the literals 80 and 200 instead of these constants.
  private static final int DEFAULT_PORT = 80;
  private static final int CODE_OK = 200;
  // Size of the read buffers and of the pushback buffer.
  private static final int BUFFER_SIZE = 16384;
  // Upper bound on redirects followed by getResponse().
  private static final int MAX_REDIRECTS = 5;

  /** Reserved value for HTTP version number, does not denote any version */
  public static final int HTTP_VER_NOTSET= -1;
  /** HTTP version 1.0 (the earliest version we use) */
  public static final int HTTP_VER_1_0= 0;
  /** HTTP version 1.1 */
  public static final int HTTP_VER_1_1= 1;
  /**
   * The newest HTTP version this client will use: 1.1 unless the
   * "http.version.1.1" configuration property is false, in which case 1.0.
   * Assigned by the static initializer below.
   */
  public static final int HTTP_VER_LATEST;

  // Passed to both Socket.setSoTimeout and Socket.connect, i.e. milliseconds.
  private int timeout = NutchConf.getInt("http.timeout", 10000);
  // Point at which downloaded content is truncated, in bytes.
  private int maxContentLength= NutchConf.getInt("http.content.limit",64*1024);

  // Sent as the User-Agent header; must be set through setAgentString().
  private String agentString= null;
  // NOTE(review): stored and settable, but not sent by any request code
  // visible in this class.
  private String agentEmail = NutchConf.get("http.agent.email");

  static {
    if (NutchConf.getBoolean("http.version.1.1", true)) 
      HTTP_VER_LATEST= HTTP_VER_1_1;
    else 
      HTTP_VER_LATEST= HTTP_VER_1_0;
  }

  /**
   * Picks the lower of two HTTP version codes, treating
   * <code>HTTP_VER_NOTSET</code> as "no information": if one argument is
   * unset the other is returned, and if both are unset the result is
   * <code>HTTP_VER_NOTSET</code>.
   *
   * @param ver1 first version code
   * @param ver2 second version code
   * @return the lesser of the two set versions, or the only set one
   */
  public static int minHttpVersion(int ver1, int ver2) {
    if (ver1 == HTTP_VER_NOTSET)
      return ver2;
    if (ver2 == HTTP_VER_NOTSET)
      return ver1;
    return Math.min(ver1, ver2);
  }

  /** Superclass for non-IO exceptions thrown during HTTP requests
   *  or parsing of responses.
   *
   *  <p>This and the exception classes below are inner (non-static) classes
   *  of Http with package-private constructors.
   */
  public abstract class HttpException extends Exception {
    HttpException(String msg) {
      super(msg);
    }

    HttpException(String msg, Throwable cause) {
      super(msg, cause);
    }
  }

  /** Exception indicating that the status line could not be parsed. */
  public class BadStatusLineException extends HttpException {
    BadStatusLineException(String msg, Throwable cause) {
      super(msg, cause);
    }
  }

  /** Exception indicating that a header line could not be parsed. */
  public class BadHeaderLineException extends HttpException {
    BadHeaderLineException(String msg) {
      super(msg);
    }
  }

  /** 
   * Exception indicating that the Content-Length header could not be
   * parsed.
   */
  public class ContentLengthParseException extends HttpException {
    ContentLengthParseException(String msg) {
      super(msg);
    }
  }

  /** 
   * Superclass for exceptions which indicate errors specific to
   * HTTP/1.1 (when these are caught, future attempts to fetch this page
   * should consider falling back to 1.0).
   */
  public abstract class Http11Exception extends HttpException {
    Http11Exception(String msg) {
      super(msg);
    }

    Http11Exception(String msg, Throwable cause) {
      super(msg, cause);
    }
  }

  /** 
   * Exception indicating that a Chunk-Length could not be parsed
   * as a hex integer.
   */ 
  public class ChunkLengthParseException extends Http11Exception {
    ChunkLengthParseException(String msg) {
      super(msg);
    }
  }

  /** 
   * Exception indicating that the server closed the socket in
   * mid-chunk.
   */ 
  public class ChunkEOFException extends Http11Exception {
    ChunkEOFException(String msg) {
      super(msg);
    }
  }

  /** 
   * Exception indicating that some data was received, but could not
   * be decompressed.
   */ 
  public class DecompressionException extends Http11Exception {
    DecompressionException(String msg) {
      super(msg);
    }
  }


  /**
   *  Simple container for raw byte counts (sent and received) and
   *  HTTP version info, which can optionally be used for accounting.
   *  Also holds a cached InetAddress, so user can cache an address.
   */
  public static class MiscHttpAccounting {
    private long bytesRead;
    private long bytesSent;
    private int httpVersion;
    private InetAddress addr;

    /** Creates an accounting record with zero counters and nothing cached. */
    public MiscHttpAccounting() {
      this.bytesRead = 0L;
      this.bytesSent = 0L;
      this.httpVersion = HTTP_VER_NOTSET;
      this.addr = null;
    }

    /** Returns the number of bytes counted as read so far. */
    public long getBytesRead() {
      return this.bytesRead;
    }

    /** Adds <code>incr</code> to the count of bytes read. */
    public void incrementBytesRead(long incr) {
      this.bytesRead = this.bytesRead + incr;
    }

    /** Returns the number of bytes counted as sent so far. */
    public long getBytesSent() {
      return this.bytesSent;
    }

    /** Adds <code>incr</code> to the count of bytes sent. */
    public void incrementBytesSent(long incr) {
      this.bytesSent = this.bytesSent + incr;
    }

    /** Returns the recorded server HTTP version code. */
    public int getServHttpVersion() {
      return this.httpVersion;
    }

    /** Records the server HTTP version code. */
    public void setServHttpVersion(int httpVersion) {
      this.httpVersion = httpVersion;
    }

    /** Returns the cached address, or <code>null</code> if none is set. */
    public InetAddress getAddr() {
      return this.addr;
    }

    /** Caches an address. */
    public void setAddr(InetAddress addr) {
      this.addr = addr;
    }

    /** Restores the state of a freshly constructed instance. */
    public void reset() {
      this.addr = null;
      this.httpVersion = HTTP_VER_NOTSET;
      this.bytesSent = 0L;
      this.bytesRead = 0L;
    }

  }

  /** An HTTP response. */
  public class Response {
    private int code;
    private int numContinues;
    private Map headers;
    private byte[] content;
    private byte[] compressedContent;
    MiscHttpAccounting httpAccounting;

    /** Returns the response code. */
    public int getCode() { return code; }

    /**
     * Returns the value of a named header, or <code>null</code> if the
     * response had no such header.  The header map is built with
     * <code>String.CASE_INSENSITIVE_ORDER</code>, so <code>name</code> is
     * matched without regard to case.
     */
    public String getHeader(String name) { return (String)headers.get(name); }

    /** Returns the full content of the response. */
    public byte[] getContent() { return content; }

    /** 
     * Returns the compressed version of the content if the server
     * transmitted a compressed version, or <code>null</code>
     * otherwise. 
     */
    public byte[] getCompressedContent() { 
      return compressedContent; 
    }

    /**
     * Returns the number of 100/Continue headers encountered 
     */
    public int getNumContinues() {
      return numContinues;
    }

    /**
     * Fetches <code>url</code> with the latest supported HTTP version, with
     * no pre-resolved address and no accounting.
     */
    private Response(URL url) 
      throws IOException, HttpException {
      this(url, null, null, HTTP_VER_LATEST);
    }

    /**
     * Performs one HTTP GET for <code>url</code> and populates this response
     * with the status code, headers and (possibly truncated) content.  The
     * socket is always closed before the constructor returns.
     *
     * @param url            the URL to fetch; its protocol must be "http"
     * @param addr           address to connect to, or <code>null</code> to
     *                       resolve the URL's host
     * @param httpAccounting optional accounting record to update; may be
     *                       <code>null</code>
     * @param httpVersion    requested HTTP version code; values outside
     *                       0..HTTP_VER_LATEST are replaced by HTTP_VER_LATEST
     * @throws IOException   on network errors or a non-HTTP URL
     * @throws HttpException if the response cannot be parsed or decompressed
     */
    private Response(URL url, InetAddress addr,
                     MiscHttpAccounting httpAccounting,
                     int httpVersion) 
      throws IOException, HttpException {

      this.httpAccounting= httpAccounting;

      if (!"http".equals(url.getProtocol()))
        throw new IOException("Not an HTTP url:" + url);

      if ( (httpVersion < 0) || (httpVersion > HTTP_VER_LATEST) )
        httpVersion= HTTP_VER_LATEST;

      if (LOG.isLoggable(Level.FINE))
        LOG.fine("fetching " + url);

      // getFile() is path + query.  For "http://host?x=1" it is "?x=1", so
      // testing for the empty string alone would produce the invalid request
      // line "GET ?x=1".  Always send an absolute path.
      String path = url.getFile();
      if (!path.startsWith("/"))
        path = "/" + path;

      // some servers will redirect a request with a host line like
      // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
      // don't want the :80...
      int port;
      String portString;
      if (url.getPort() == -1) {
        port= DEFAULT_PORT;
        portString= "";
      } else {
        port= url.getPort();
        portString= ":" + port;
      }

      Socket socket = null;

      try {
        socket = new Socket();                    // create the socket
        socket.setSoTimeout(timeout);

        if (addr == null) {
          addr= InetAddress.getByName(url.getHost());
          if (httpAccounting != null) 
            httpAccounting.setAddr(addr);
        }

        // connect
        InetSocketAddress sockAddr= new InetSocketAddress(addr, port);
        socket.connect(sockAddr, timeout);

        OutputStream req = socket.getOutputStream(); // make request

        StringBuffer reqStr = new StringBuffer("GET ");
        reqStr.append(path);

        if (httpVersion == HTTP_VER_1_1)
          reqStr.append(" HTTP/1.1\r\n");
        else 
          reqStr.append(" HTTP/1.0\r\n");

        reqStr.append("Host: ");
        reqStr.append(url.getHost());
        reqStr.append(portString);
        reqStr.append("\r\n");

        if (httpVersion == HTTP_VER_1_1) {
          reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
          reqStr.append("Connection: close\r\n");
        }

        if ((agentString == null) || (agentString.length() == 0)) {
          LOG.severe("User-agent is not set!");
        } else {
          reqStr.append("User-Agent: ");
          reqStr.append(agentString);
          reqStr.append("\r\n");
        }

        reqStr.append("\r\n");
        byte[] reqBytes= reqStr.toString().getBytes();

        if (httpAccounting != null) 
          httpAccounting.incrementBytesSent(reqBytes.length);

        req.write(reqBytes);
        req.flush();
        
        PushbackInputStream in =                  // process response
          new PushbackInputStream(
            new BufferedInputStream(socket.getInputStream(), BUFFER_SIZE), 
            BUFFER_SIZE) ;

        StringBuffer line = new StringBuffer();

        // Skip any number of "100 Continue" interim responses; the headers
        // kept are those of the first non-100 status.
        numContinues= -1;
        boolean haveSeenNonContinueStatus= false;
        while (!haveSeenNonContinueStatus) {
          numContinues++;
          // parse status code line
          this.code = parseStatusLine(in, line); 
          // parse headers
          this.headers = parseHeaders(in, line); 
          haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
        }

        // Coding names are case-insensitive and header values may carry
        // stray whitespace, so normalize before comparing.
        String transferCoding= getHeader("Transfer-Encoding");
        if (transferCoding != null
            && "chunked".equalsIgnoreCase(transferCoding.trim())) {
          LOG.fine("fetching chunked!");
          try {
            readChunkedContent(in, line);
          } catch (EOFException e) {
            throw new ChunkEOFException("unexpected end of stream while"
                                        + " reading chunked content from "
                                        + url);
          }
        } else {
          LOG.fine("fetching plain!");
          readPlainContent(in);
        }

        String contentEncoding= getHeader("Content-Encoding");
        if (contentEncoding != null)
          contentEncoding= contentEncoding.trim();
        if ("gzip".equalsIgnoreCase(contentEncoding)
          || "x-gzip".equalsIgnoreCase(contentEncoding)) {
          LOG.fine("uncompressing....");
          compressedContent= content;
          FetcherStatus.logTraceMisc(FetcherConstants.MISC_INFORMATIONAL, 
                                     "about to decompress: " + url);

          content= GZIPUtils.unzipBestEffort(compressedContent, 
                                             maxContentLength);
          if (content == null)
            throw new DecompressionException("unzipBestEffort returned null");

          if (LOG.isLoggable(Level.FINE))
            LOG.fine("fetched " + compressedContent.length
                     + " bytes of compressed content (expanded to "
                     + content.length + " bytes) from " + url);
        } else {
          if (LOG.isLoggable(Level.FINE))
            LOG.fine("fetched " + content.length + " bytes from " + url);
        }

      } finally {
        if (socket != null)
          socket.close();
      }

    }

    /**
     * Reads a non-chunked body into <code>content</code>.  Reading stops at
     * end of stream, after Content-Length bytes, or after
     * <code>maxContentLength</code> bytes, whichever comes first, and never
     * stores more than that many bytes.
     *
     * @param in stream positioned at the first body byte
     * @throws HttpException if the Content-Length header is not a number
     * @throws IOException on read errors
     */
    private void readPlainContent(InputStream in) 
      throws HttpException, IOException {

      int contentLength = Integer.MAX_VALUE;    // get content length
      String contentLengthString = (String)headers.get("Content-Length");
      if (contentLengthString != null) {
        contentLengthString = contentLengthString.trim();
        long declared;
        try {
          // parse as long so that bodies larger than 2GB are truncated at
          // the limit instead of being rejected as unparseable
          declared = Long.parseLong(contentLengthString);
        } catch (NumberFormatException e) {
          throw new ContentLengthParseException(contentLengthString);
        }
        // a negative length carries no information: read until EOF/limit
        if (declared >= 0 && declared < contentLength)
          contentLength = (int)declared;
      }
      if (contentLength > maxContentLength)   // limit download size
        contentLength  = maxContentLength;

      ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);
      byte[] bytes = new byte[BUFFER_SIZE];
      int length = 0;                           // read content
      while (length < contentLength) {
        // never ask for more than is still wanted, so the stored content
        // cannot overshoot the limit and a zero length does not block
        int toRead = Math.min(bytes.length, contentLength - length);
        int i = in.read(bytes, 0, toRead);
        if (i == -1)
          break;

        if (httpAccounting != null) 
          httpAccounting.incrementBytesRead(i);

        out.write(bytes, 0, i);
        length += i;
      }
      this.content = out.toByteArray();
    }

    /**
     * Reads a body sent with "Transfer-Encoding: chunked" into
     * <code>content</code>.  At most <code>maxContentLength</code> bytes are
     * stored in total; once that limit is reached reading stops and the rest
     * of the response, including any trailer, is ignored.
     *
     * @param in   stream positioned at the first chunk-size line
     * @param line scratch buffer used for line reads
     * @throws HttpException if a chunk size is not a non-negative hex number
     *         (ChunkLengthParseException) or the stream ends inside a chunk
     *         (ChunkEOFException)
     * @throws IOException on read errors, including EOFException when the
     *         stream ends inside a chunk-size line
     */
    private void readChunkedContent(PushbackInputStream in,  
                                    StringBuffer line) 
    throws HttpException, IOException {
      boolean doneChunks= false;
      boolean truncated= false;
      int contentBytesRead= 0;
      byte[] bytes = new byte[BUFFER_SIZE];
      ByteArrayOutputStream out = new ByteArrayOutputStream(BUFFER_SIZE);

      while (!doneChunks && !truncated) {
        LOG.fine("Http: starting chunk");

        readLine(in, line, false);

        if (httpAccounting != null) 
          httpAccounting.incrementBytesRead(line.length());

        // the size may be followed by ";extension" text, which is ignored
        String chunkLenStr;
        int pos= line.indexOf(";");
        if (pos < 0) {
          chunkLenStr= line.toString();
        } else {
          chunkLenStr= line.substring(0, pos);
        }
        chunkLenStr= chunkLenStr.trim();

        int chunkLen;
        try {
          chunkLen= Integer.parseInt(chunkLenStr, 16);
        } catch (NumberFormatException e){ 
          throw new ChunkLengthParseException(line.toString());
        }
        if (chunkLen < 0)                 // parseInt accepts a leading '-'
          throw new ChunkLengthParseException(line.toString());

        if (chunkLen == 0) {              // last-chunk marker
          doneChunks= true;
          break;
        }

        // enforce the limit on the running total, not on each chunk alone
        int remaining= maxContentLength - contentBytesRead;
        if (chunkLen > remaining) {
          chunkLen= remaining;
          truncated= true;
        }

        // read one chunk
        int chunkBytesRead= 0;
        while (chunkBytesRead < chunkLen) {

          int toRead= (chunkLen - chunkBytesRead) < BUFFER_SIZE ?
            (chunkLen - chunkBytesRead) : BUFFER_SIZE;
          int len= in.read(bytes, 0, toRead);

          if (len == -1) 
            throw new ChunkEOFException("after " + contentBytesRead
                                        + " bytes in successful chunks"
                                        + " and " + chunkBytesRead 
                                        + " in current chunk");

          // DANGER!!! Will print GZIPed stuff right to your
          // terminal!
          // LOG.fine("read: " +  new String(bytes, 0, len));

          if (httpAccounting != null) 
            httpAccounting.incrementBytesRead(len);

          out.write(bytes, 0, len);
          chunkBytesRead+= len;  
        }
        contentBytesRead+= chunkBytesRead;

        // after a truncated chunk the stream is in mid-chunk, so the next
        // bytes are data, not a chunk header: stop here
        if (truncated)
          break;

        readLine(in, line, false);        // consume the CRLF ending the chunk

        if (httpAccounting != null) 
          httpAccounting.incrementBytesRead(line.length());

      }

      this.content= out.toByteArray();

      // trailer headers exist only when the last-chunk marker was seen; they
      // are read to drain the stream but their values are discarded
      if (doneChunks)
        parseHeaders(in, line);

    }

    /**
     * Reads and parses the status line ("HTTP/1.1 200 OK").  If accounting is
     * enabled, also records the server's HTTP version: 1.0 or 1.1 when the
     * line carries a well-formed "HTTP/1.x" prefix, and 1.0 as a fallback
     * for anything else.
     *
     * @param in   stream positioned at the start of the status line
     * @param line scratch buffer; holds the status line on return
     * @return the numeric status code
     * @throws IOException on read errors or end of stream
     * @throws HttpException (BadStatusLineException) if the status code is
     *         not a number
     */
    private int parseStatusLine(PushbackInputStream in, StringBuffer line)
      throws IOException, HttpException {
      readLine(in, line, false);

      // approximate bytes by chars- should be right for HTTP
      if (httpAccounting != null) 
        httpAccounting.incrementBytesRead(line.length());

      int codeStart = line.indexOf(" ");
      int codeEnd = line.indexOf(" ", codeStart+1);

      // handle lines with no plaintext result code, ie:
      // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
      if (codeEnd == -1) 
        codeEnd= line.length();

      int code;
      try {
        code= Integer.parseInt(line.substring(codeStart+1, codeEnd));
      } catch (NumberFormatException e) {
        throw new BadStatusLineException("bad status line '" + line 
                                         + "': " + e.getMessage(), e);
      }

      int servVersionCode= HTTP_VER_NOTSET;
      if (line.toString().startsWith("HTTP/")) {
        int dotPos= line.indexOf(".");
        // the dot must lie inside the version token, i.e. after "HTTP/" and
        // before the first space; otherwise the substrings below are invalid
        if (dotPos > 5 && dotPos < codeStart) {
          try {
            int httpMajorVer= Integer.parseInt( line.substring(5, dotPos) );
            int httpMinorVer=
              Integer.parseInt( line.substring(dotPos+1, codeStart) );

            if (httpMajorVer == 1) {
              if (httpMinorVer < 1) 
                servVersionCode= HTTP_VER_1_0;
              else 
                servVersionCode= HTTP_VER_1_1;
            }
          } catch (NumberFormatException e) {
            // malformed version number: handled by the fallback below
          }
        }
      }

      if (servVersionCode == HTTP_VER_NOTSET) // bogus, always fall back
        servVersionCode= HTTP_VER_1_0;

      if (httpAccounting != null) {
        httpAccounting.setServHttpVersion(servVersionCode);
      }

      return code;
    }


    /**
     * Splits one header line at its first colon and stores the pair in
     * <code>headers</code>.  Spaces and tabs directly after the colon are
     * dropped from the value.  A line without a colon is ignored when it
     * holds only whitespace and is an error otherwise.
     *
     * @param line    the header line, without its line terminator
     * @param headers map receiving the name/value pair
     * @throws HttpException (BadHeaderLineException) if a non-blank line has
     *         no colon
     */
    private void processHeaderLine(StringBuffer line, TreeMap headers)
      throws IOException, HttpException {
      final int lineLength = line.length();
      final int colon = line.indexOf(":");

      if (colon == -1) {
        boolean blank = true;
        for (int k = 0; blank && k < lineLength; k++) {
          blank = Character.isWhitespace(line.charAt(k));
        }
        if (blank) {
          return;
        }
        throw new BadHeaderLineException("No colon in header:" + line);
      }

      // value starts at the first character after the colon that is
      // neither a space nor a tab
      int start = colon + 1;
      while (start < lineLength
             && (line.charAt(start) == ' ' || line.charAt(start) == '\t')) {
        start++;
      }

      headers.put(line.substring(0, colon), line.substring(start));
    }

    /**
     * Reads header lines up to the blank line that ends the header section
     * and returns them in a new map whose keys compare case-insensitively.
     *
     * @param in   stream positioned at the first header line
     * @param line scratch buffer used for line reads
     * @return the parsed headers
     * @throws IOException on read errors or end of stream
     * @throws HttpException if a header line is malformed
     */
    private Map parseHeaders(PushbackInputStream in, StringBuffer line)
      throws IOException, HttpException {
      TreeMap headers = new TreeMap(String.CASE_INSENSITIVE_ORDER);
      return parseHeaders(in, line, headers);
    }

    /**
     * Adds headers to an existing TreeMap.  Reading stops at the first empty
     * line, or early when a line contains the start of an HTML document (the
     * server omitted the blank line); in that case the document part is
     * pushed back onto the stream so that it is read as content.
     *
     * @param in      stream positioned at the first header line
     * @param line    scratch buffer used for line reads
     * @param headers map receiving the headers
     * @return <code>headers</code>
     * @throws IOException on read errors or end of stream
     * @throws HttpException if a header line is malformed
     */
    private Map parseHeaders(PushbackInputStream in, StringBuffer line,
                             TreeMap headers)
      throws IOException, HttpException {
      while (readLine(in, line, true) != 0) {

        // handle HTTP responses with missing blank line after headers
        int pos;
        if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) 
             || ((pos= line.indexOf("<HTML")) != -1) 
             || ((pos= line.indexOf("<html")) != -1) ) {

          // readLine() turns each byte into one char, which is ISO-8859-1
          // decoding; the same charset must be used to get the original
          // bytes back.  UTF-8 would expand every byte >= 0x80 into two.
          in.unread(line.substring(pos).getBytes("ISO-8859-1"));
          line.setLength(pos);

          // approximate bytes by chars- should be right for HTTP
          if (httpAccounting != null) 
            httpAccounting.incrementBytesRead(pos);

          try {
            processHeaderLine(line, headers);
          } catch (HttpException e) {
            // best effort: what precedes the markup may not be a header at
            // all, so report it and carry on with the headers found so far
            LOG.log(Level.WARNING,
                    "ignoring malformed header before HTML content: " + line,
                    e);
          }

          return headers;
        }

        // approximate bytes by chars- should be right for HTTP
        if (httpAccounting != null) 
          httpAccounting.incrementBytesRead(line.length());

        processHeaderLine(line, headers);
      }
      return headers;
    }
  }

  /**
   * Sets the timeout applied to both connecting and reading.
   *
   * @param timeout the timeout in milliseconds
   */
  public void setTimeout(int timeout) {
    this.timeout = timeout;
  }

  /**
   * Sets the point at which content is truncated.
   *
   * @param length the maximum number of content bytes to keep
   */
  public void setMaxContentLength(int length) {
    this.maxContentLength = length;
  }

  /**
   * Sets the agent name sent in the User-Agent request header.
   *
   * @param agentString the agent name
   */
  public void setAgentString(String agentString) {
    this.agentString = agentString;
  }

  /**
   * Sets the return email address.
   *
   * @param agentEmail the email address
   */
  public void setAgentEmail(String agentEmail) {
    this.agentEmail = agentEmail;
  }

  /** 
   * Make a single HTTP request and return its response, not following
   * redirects and not translating HTTP errors to exceptions.  If
   * <code>addr</code> is not null, that address will be used.  If
   * <code>httpAccounting</code> is not <code>null</code>, then its
   * fields will be updated during this request.  The request will be issued 
   * using the HTTP version specified by <code>httpVersion</code>.
   *
   * @param url            the URL to fetch
   * @param addr           address to connect to, or <code>null</code>
   * @param httpAccounting accounting record to update, or <code>null</code>
   * @param httpVersion    HTTP version code to use for the request
   * @return the response, whatever its status code
   * @throws IOException   on network errors
   * @throws HttpException if the response cannot be parsed
   */
  public Response getRawResponse(URL url, InetAddress addr,
                                 MiscHttpAccounting httpAccounting,
                                 int httpVersion)
    throws IOException, HttpException {
    return new Response(url, addr, httpAccounting, httpVersion);
  }

  /**
   * Returns the content of a URL.  Follows redirects and translates HTTP
   * errors to exceptions.
   *
   * <p>Up to <code>MAX_REDIRECTS</code> redirects are followed.  A relative
   * Location header is resolved against the URL that produced it.
   *
   * @param url the URL to fetch
   * @return the first response with status 200
   * @throws IOException on network errors, on any status other than 200 or
   *         3xx, on too many redirects, or on a redirect whose Location
   *         header is missing or malformed
   * @throws HttpException if a response cannot be parsed
   */
  public Response getResponse(URL url) throws IOException, HttpException {

    int redirects = 0;
    URL target = url;

    while (true) {
      Response response = new Response(target);   // make a request

      int code = response.getCode();

      if (code == CODE_OK) {                      // got a good response
        return response;                          // return it

      } else if (code >= 300 && code < 400) {     // handle redirect
        if (redirects == MAX_REDIRECTS)
          throw new IOException("Too many redirects: " + url);

        String location = response.getHeader("Location");
        if (location == null)
          throw new IOException("HTTP redirect " + code
                                + " without Location header: " + target);

        // resolve against the current URL so that relative redirects
        // ("/new/path") work; absolute locations are taken as they are
        target = new URL(target, location.trim());
        redirects++;                
        LOG.fine("redirect to " + target); 

      } else {                                    // convert to exception
        throw new IOException("HTTP Error: " + code);
      }
    } 
  }

  /**
   * Reads one line into <code>line</code>, replacing its previous contents.
   * A line ends at "\r\n", a lone "\r" or a lone "\n"; the terminator is not
   * stored.  Each byte read becomes one char.
   *
   * <p>When <code>allowContinuedLine</code> is set and the line read so far
   * is not empty, a following line that starts with a space or tab is
   * treated as a continuation: that one character is dropped and reading
   * goes on into the same buffer.
   *
   * @param in                 the stream to read from
   * @param line               buffer receiving the line
   * @param allowContinuedLine whether continuation lines are joined
   * @return the length of the line read
   * @throws EOFException if the stream ends before a line terminator
   * @throws IOException on other read errors
   */
  private static int readLine(PushbackInputStream in, StringBuffer line,
                              boolean allowContinuedLine)
    throws IOException {
    line.setLength(0);

    int c;
    while ((c = in.read()) != -1) {
      if (c != '\r' && c != '\n') {               // ordinary character
        line.append((char)c);
        continue;
      }

      // at a line terminator: swallow the '\n' of a "\r\n" pair
      if (c == '\r' && peek(in) == '\n') {
        in.read();
      }

      // check for a continued line, but only if the current (possibly
      // continued) line wasn't blank
      if (line.length() > 0 && allowContinuedLine) {
        int next = peek(in);
        if (next == ' ' || next == '\t') {
          in.read();
          continue;
        }
      }

      return line.length();                       // line is complete
    }

    throw new EOFException();
  }

  /**
   * Returns the next byte of the stream without consuming it.
   *
   * @param in the stream to look into
   * @return the next byte as a value 0-255, or -1 at end of stream
   * @throws IOException on read errors
   */
  private static int peek(PushbackInputStream in) throws IOException {
    int value = in.read();
    // Only a real byte may be pushed back: unread(int) stores the low eight
    // bits of its argument, so unread(-1) would plant a 0xFF byte in the
    // stream and hide the end of stream from the next read.
    if (value != -1)
      in.unread(value);
    return value;
  }

  /**
   * For debugging.  Fetches one URL and prints its status code and content.
   *
   * <p>Usage: <code>Http [-verbose] [-timeout N] url</code>, where N is a
   * number of seconds and the URL must be the last argument.  Prints the
   * usage line and exits with status -1 if the arguments are invalid.
   */
  public static void main(String[] args) throws Exception {
    int timeout = -1;
    boolean verbose = false;
    String urlString = null;

    String usage = "Usage: Http [-verbose] [-timeout N] url";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) {       // parse command line
      if (args[i].equals("-timeout")) {           // found -timeout option
        if (i + 1 >= args.length) {               // value is missing
          System.err.println(usage);
          System.exit(-1);
        }
        try {
          timeout = Integer.parseInt(args[++i]) * 1000;
        } catch (NumberFormatException e) {       // value is not a number
          System.err.println(usage);
          System.exit(-1);
        }
      } else if (args[i].equals("-verbose")) {    // found -verbose option
        verbose = true;
      } else if (i != args.length-1) {
        System.err.println(usage);
        System.exit(-1);
      } else                                      // root is required parameter
        urlString = args[i];
    }

    if (urlString == null) {                      // only options were given
      System.err.println(usage);
      System.exit(-1);
    }

    Http http = new Http();

    if (timeout != -1)                            // set timeout
      http.setTimeout(timeout);
    // set log level
    if (verbose) {
      LOG.setLevel(Level.FINE);
    }

    Response response = http.getResponse(new URL(urlString));

    System.out.println("Code = " + response.getCode());
    System.out.println("Content:");
    
    String content = new String(response.getContent());

    System.out.println(content);
  }

}

--- NEW FILE: URLFilterFactory.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

import net.nutch.util.*;
import java.util.logging.*;

/** Factory to create a URLFilter from "urlfilter.class" config property. */
public class URLFilterFactory {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.URLFilterFactory");

  private static final String URLFILTER_CLASS =
    NutchConf.get("urlfilter.class");

  private URLFilterFactory() {}                   // no public ctor

  private static URLFilter filter;

  /**
   * Return the default URLFilter implementation.  The filter is created on
   * the first call, through the public no-argument constructor of the
   * configured class, and the same instance is returned afterwards.
   *
   * <p>Synchronized so that concurrent first calls create only one instance
   * and every caller sees it fully constructed.
   *
   * @return the shared filter instance
   * @throws RuntimeException if "urlfilter.class" is not set or the class
   *         cannot be loaded or instantiated
   */
  public static synchronized URLFilter getFilter() {

    if (filter == null) {
      if (URLFILTER_CLASS == null)
        throw new RuntimeException("Couldn't create URL filter:"
                                   + " urlfilter.class is not set");
      try {
        LOG.info("Using URL filter: " + URLFILTER_CLASS);
        Class filterClass = Class.forName(URLFILTER_CLASS);
        filter = (URLFilter)filterClass.newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Couldn't create "+URLFILTER_CLASS, e);
      }
    }

    return filter;

  }

}

--- NEW FILE: UrlNormalizer.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.net;

import java.net.URL;
import java.net.MalformedURLException;
// import java.net.URI;
// import java.net.URISyntaxException;

import java.util.logging.Logger;
import net.nutch.util.LogFormatter;

/** Converts URLs to a normal form. */
public class UrlNormalizer {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.UrlNormalizer");

  /**
   * Normalizes a URL string.  Surrounding whitespace is always removed.  For
   * "http" and "ftp" URLs the host is lowercased, an explicit default port
   * is removed, an empty path becomes "/" and a "#fragment" is removed.  An
   * uppercase protocol is lowercased for every kind of URL.
   *
   * <p>Whenever any of these rules applies the URL is rebuilt from protocol,
   * host, port and file (path plus query) only, so any other component of
   * the original string is dropped.
   *
   * @param urlString the URL to normalize; empty or all-whitespace input
   *        yields the empty string
   * @return the normalized URL string
   * @throws MalformedURLException if <code>urlString</code> is not empty and
   *         cannot be parsed as a URL
   */
  public static String normalize(String urlString)
    throws MalformedURLException {

    // trim first, so that whitespace-only input is permitted like ""
    urlString = urlString.trim();                 // remove extra spaces

    if ("".equals(urlString))                     // permit empty
      return urlString;

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    boolean changed = false;

    if (!urlString.startsWith(protocol))        // protocol was lowercased
      changed = true;

    if ("http".equals(protocol) || "ftp".equals(protocol)) {
      
      if (host != null) {
        // fixed locale: with the default locale a Turkish system would
        // lowercase 'I' to a dotless i and corrupt the host name
        String newHost =
          host.toLowerCase(java.util.Locale.ENGLISH); // lowercase host
        if (!host.equals(newHost)) {
          host = newHost;
          changed = true;
        }
      }

      if (port == url.getDefaultPort()) {       // uses default port
        port = -1;                              // so don't specify it
        changed = true;
      }

      if (file == null || "".equals(file)) {    // add a slash
        file = "/";
        changed = true;
      }

      // The rebuild below never includes the fragment.  Force it whenever a
      // fragment is present, so that the same URL does not normalize
      // differently depending on whether another rule happened to fire.
      if (url.getRef() != null)
        changed = true;

    }

    if (changed)
      urlString = new URL(protocol, host, port, file).toString();

    return urlString;
  }

}



-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] playground/src/java/net/nutch/net RegexURLFilter.java,NONE,1.1 URLFilter.java,NONE,1.1 Http.java,NONE,1.1 URLFilterFactory.java,NONE,1.1 UrlNormalizer.java,NONE,1.1

Reply via email to