Hi,
I don't know if this was the right thing to do, but I needed Basic Auth over HTTPS, and when the web server returned a 401 with null content the authenticator couldn't get any MultiProperties... so I've added Basic Auth support to Http.java. The config looks like this:
<property>
  <name>http.auth.basic.username.0</name>
  <value>auser</value>
  <description>Usernames</description>
</property>
<property>
  <name>http.auth.basic.password.0</name>
  <value>apassword</value>
  <description>Passwords</description>
</property>
<property>
  <name>http.auth.basic.realm.0</name>
  <value>A Realm with spaces</value>
  <description>Realms</description>
</property>
<property>
  <name>http.auth.basic.username.1</name>
  <value>auser</value>
  <description>Usernames</description>
</property>
<property>
  <name>http.auth.basic.password.1</name>
  <value>apassword</value>
  <description>Passwords</description>
</property>
<property>
  <name>http.auth.basic.realm.1</name>
  <value>A Realm with spaces</value>
  <description>Realms</description>
</property>


If this is any good, could it go in? I want to use Nutch as a search tool inside Sakai (www.sakaiproject.org), and it would be much better to use the distro rather than a hacked version.

If it's no good, then I assume that you will have Basic Auth by some other means (HttpAuthenticationFactory et al.?).

Ian








/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package org.apache.nutch.protocol.httpclient;

import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.UsernamePasswordCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.nutch.db.Page;
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RetryLater;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;

/** An implementation of the Http protocol. */
public class Http implements org.apache.nutch.protocol.Protocol {

  /** Logger used by this protocol implementation. */
  public static final Logger LOG =
      LogFormatter.getLogger("org.apache.nutch.net.Http");

  // Switch the logger to FINE when "http.verbose" is enabled in the configuration.
  static {
    final boolean verboseRequested = NutchConf.get().getBoolean("http.verbose", false);
    if (verboseRequested) {
      LOG.setLevel(Level.FINE);
    }
  }

  /** Size in bytes applied to the socket send and receive buffers in configureClient(). */
  static final int BUFFER_SIZE = 8 * 1024;

  /**
   * Value of "http.redirect.max" (default 3). In this class it caps how often a
   * 401 response is retried in getProtocolOutput(FetchListEntry).
   */
  private static final int MAX_REDIRECTS =
      NutchConf.get().getInt("http.redirect.max", 3);

  /** Connection manager handed to the single shared HttpClient. */
  private static MultiThreadedHttpConnectionManager connectionManager =
      new MultiThreadedHttpConnectionManager();

  /** Shared client; stays null until getClient() triggers configureClient(). */
  private static HttpClient client;

  /**
   * Returns the shared HttpClient, building and configuring it on the first
   * call. The method is synchronized, so configureClient() runs under the
   * class lock.
   */
  static synchronized HttpClient getClient() {
    if (client == null) {
      configureClient();
    }
    return client;
  }

  // ---- Settings read from the Nutch configuration when the class is loaded ----

  /** "http.proxy.host"; a null or empty value means no proxy is used (see PROXY). */
  static String PROXY_HOST = NutchConf.get().get("http.proxy.host");

  /** "http.proxy.port", default 8080. */
  static int PROXY_PORT = NutchConf.get().getInt("http.proxy.port", 8080);

  /** True when a non-empty proxy host has been configured. */
  static boolean PROXY = PROXY_HOST != null && PROXY_HOST.length() > 0;

  /**
   * "http.timeout", default 10000. Applied as both connection and socket
   * timeout in configureClient(); main() may overwrite it from the command line.
   */
  static int TIMEOUT = NutchConf.get().getInt("http.timeout", 10000);

  /** "http.content.limit", default 64 * 1024. */
  static int MAX_CONTENT = NutchConf.get().getInt("http.content.limit", 64 * 1024);

  /** "http.max.delays", default 3: waits allowed in blockAddr() before RetryLater is thrown. */
  static int MAX_DELAYS = NutchConf.get().getInt("http.max.delays", 3);

  /** "fetcher.threads.per.host", default 1: concurrent threads allowed per address. */
  static int MAX_THREADS_PER_HOST = NutchConf.get().getInt("fetcher.threads.per.host", 1);

  /** "fetcher.threads.fetch", default 10: used as the connection manager's total connection limit. */
  static int MAX_THREADS_TOTAL = NutchConf.get().getInt("fetcher.threads.fetch", 10);

  /** User-Agent string assembled from the http.agent.* properties. */
  static String AGENT_STRING = getAgentString();

  /** "fetcher.server.delay" (seconds, default 1.0) converted to milliseconds. */
  static long SERVER_DELAY =
      (long) (NutchConf.get().getFloat("fetcher.server.delay", 1.0f) * 1000);

  // NTLM credentials; an empty username disables NTLM setup in configureClient().
  static String NTLM_USERNAME = NutchConf.get().get("http.auth.ntlm.username", "");
  static String NTLM_PASSWORD = NutchConf.get().get("http.auth.ntlm.password", "");
  static String NTLM_DOMAIN = NutchConf.get().get("http.auth.ntlm.domain", "");
  static String NTLM_HOST = NutchConf.get().get("http.auth.ntlm.host", "");

  // BASIC credentials: parallel arrays (same index = one username/password/realm
  // triple), filled by the static initializer that follows.
  static String[] BASIC_USERNAMES = null;
  static String[] BASIC_PASSWORDS = null;
  static String[] BASIC_REALMS = null;

  static {
    LOG.info("http.proxy.host = " + PROXY_HOST);
    LOG.info("http.proxy.port = " + PROXY_PORT);

    LOG.info("http.timeout = " + TIMEOUT);
    LOG.info("http.content.limit = " + MAX_CONTENT);
    LOG.info("http.agent = " + AGENT_STRING);

    LOG.info("http.auth.ntlm.username = " + NTLM_USERNAME);
    // since realms can contain spaces, we cant use the array features, instead 
we 
    // will iterate until their is a hole in the sequence.
    int nBasic = 0;
        for ( ; nBasic < 20; nBasic++) 
        {
                if ( 
NutchConf.get().get("http.auth.basic.username."+nBasic,null) == null ||
                NutchConf.get().get("http.auth.basic.password."+nBasic,null) == 
null ||
                NutchConf.get().get("http.auth.basic.realm."+nBasic,null) == 
null ) break;
        }
        BASIC_USERNAMES = new String[nBasic];
        BASIC_PASSWORDS = new String[nBasic];
        BASIC_REALMS = new String[nBasic];
        for ( int i = 0; i < nBasic; i++) 
        {
                BASIC_USERNAMES[i] = 
NutchConf.get().get("http.auth.basic.username."+i,null);
                BASIC_PASSWORDS[i] = 
NutchConf.get().get("http.auth.basic.password."+i,null);
                BASIC_REALMS[i] = 
NutchConf.get().get("http.auth.basic.realm."+i,null);
                LOG.info("http.auth.basic.username."+i+" = 
"+BASIC_USERNAMES[i]);
                LOG.info("http.auth.basic.realm."+i+" = "+BASIC_REALMS[i]);
        }
    if ( nBasic == 0)
    {
                LOG.info("no basic auth credentials specified");
    }

    LOG.info("fetcher.server.delay = " + SERVER_DELAY);
    LOG.info("http.max.delays = " + MAX_DELAYS);
  }

  /**
   * Maps from InetAddress to a Long naming the time it should be unblocked. The
   * Long is zero while the address is in use, then set to now+wait when a
   * request finishes. This way only one thread at a time accesses an address.
   *
   * This map also serves as the lock object: blockAddr(),
   * cleanExpiredServerBlocks() and unblockAddr() all synchronize on it before
   * touching any of the three bookkeeping collections declared here.
   */
  private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();

  /**
   * Maps an address to the number of threads accessing that address.
   * Values are Integer; the entry is removed when the last thread releases
   * the address in unblockAddr().
   */
  private static HashMap THREADS_PER_HOST_COUNT = new HashMap();

  /**
   * Queue of blocked InetAddress. This contains all of the non-zero entries
   * from BLOCKED_ADDR_TO_TIME, ordered by increasing time.
   *
   * unblockAddr() adds at the head and cleanExpiredServerBlocks() inspects and
   * removes from the tail, so the tail holds the entry that was queued first.
   */
  private static LinkedList BLOCKED_ADDR_QUEUE = new LinkedList();

  // NOTE(review): this instance field is not read anywhere in this class; the
  // robots check in getProtocolOutput() calls RobotRulesParser.isAllowed()
  // statically. Presumably constructing the parser has a purpose - confirm.
  private RobotRulesParser robotRules = new RobotRulesParser();

  private static InetAddress blockAddr(URL url) throws ProtocolException {
    InetAddress addr;
    try {
      addr = InetAddress.getByName(url.getHost());
    } catch (UnknownHostException e) {
      throw new HttpException(e);
    }

    int delays = 0;
    while (true) {
      cleanExpiredServerBlocks(); // free held addresses

      Long time;
      synchronized (BLOCKED_ADDR_TO_TIME) {
        time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
        if (time == null) { // address is free

          // get # of threads already accessing this addr
          Integer counter = (Integer) THREADS_PER_HOST_COUNT.get(addr);
          int count = (counter == null) ? 0 : counter.intValue();

          count++; // increment & store
          THREADS_PER_HOST_COUNT.put(addr, new Integer(count));

          if (count >= MAX_THREADS_PER_HOST) {
            BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
          }
          return addr;
        }
      }

      if (delays == MAX_DELAYS) throw new RetryLater(url, "Exceeded 
http.max.delays: retry later.");

      long done = time.longValue();
      long now = System.currentTimeMillis();
      long sleep = 0;
      if (done == 0) { // address is still in use
        sleep = SERVER_DELAY; // wait at least delay

      } else if (now < done) { // address is on hold
        sleep = done - now; // wait until its free
      }

      try {
        Thread.sleep(sleep);
      } catch (InterruptedException e) {}
      delays++;
    }
  }

  /**
   * Releases every address whose unblock time has passed. Entries are examined
   * from the tail of BLOCKED_ADDR_QUEUE and the scan stops at the first one
   * that is still on hold.
   */
  private static void cleanExpiredServerBlocks() {
    synchronized (BLOCKED_ADDR_TO_TIME) {
      boolean expired = true;
      while (expired && !BLOCKED_ADDR_QUEUE.isEmpty()) {
        InetAddress oldest = (InetAddress) BLOCKED_ADDR_QUEUE.getLast();
        Long releaseAt = (Long) BLOCKED_ADDR_TO_TIME.get(oldest);
        expired = releaseAt.longValue() <= System.currentTimeMillis();
        if (expired) {
          BLOCKED_ADDR_TO_TIME.remove(oldest);
          BLOCKED_ADDR_QUEUE.removeLast();
        }
      }
    }
  }

  /**
   * Releases one thread's hold on the given address. While other threads still
   * use the address only the counter is decremented; when the last one leaves,
   * the address is queued and put on hold until now + SERVER_DELAY.
   *
   * @param addr an address previously returned by blockAddr()
   */
  private static void unblockAddr(InetAddress addr) {
    synchronized (BLOCKED_ADDR_TO_TIME) {
      Integer active = (Integer) THREADS_PER_HOST_COUNT.get(addr);
      int remaining = active.intValue() - 1;
      if (remaining != 0) {
        THREADS_PER_HOST_COUNT.put(addr, new Integer(remaining));
        return;
      }

      // Last user of this address: start its server delay.
      THREADS_PER_HOST_COUNT.remove(addr);
      BLOCKED_ADDR_QUEUE.addFirst(addr);
      long releaseAt = System.currentTimeMillis() + SERVER_DELAY;
      BLOCKED_ADDR_TO_TIME.put(addr, new Long(releaseAt));
    }
  }

  /**
   * Fetches the given URL by wrapping it in a FetchListEntry (fetch flag true,
   * page score 1.0f, no anchors) and delegating to
   * getProtocolOutput(FetchListEntry).
   *
   * @param urlString the URL to fetch
   * @return the fetch result; when building the entry throws a
   *         MalformedURLException, an output with null content and a status
   *         wrapping that exception
   */
  public ProtocolOutput getProtocolOutput(String urlString) {
    try {
      FetchListEntry entry =
          new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]);
      return getProtocolOutput(entry);
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }

  /**
   * Fetches the URL of the given fetch-list entry and maps the HTTP response
   * code to a ProtocolOutput:
   * <ul>
   * <li>200: the content is returned;</li>
   * <li>3xx: a MOVED / TEMP_MOVED / NOTMODIFIED status carrying the target URL
   *     (redirects are followed by the caller, not here);</li>
   * <li>400 and 410: GONE; 404: NOTFOUND;</li>
   * <li>401: the request is repeated up to MAX_REDIRECTS times, provided the
   *     response metadata is a MultiProperties, otherwise ACCESS_DENIED;</li>
   * <li>anything else: EXCEPTION.</li>
   * </ul>
   * URLs disallowed by the robot rules yield ROBOTS_DENIED. Any Throwable
   * raised while fetching is logged and returned wrapped in a ProtocolStatus.
   *
   * @param fle the entry whose URL should be fetched
   * @return the content and/or status of the fetch, never null
   */
  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
    String urlString = fle.getUrl().toString();
    try {
      URL url = new URL(urlString);

      int redirects = 0;
      // NOTE(review): auth is assigned in the 401 branch but never read in this
      // method; presumably the configured HttpClient state supplies the
      // credentials on the repeated request - confirm.
      HttpAuthentication auth = null;
      while (true) {
        try {
          if (!RobotRulesParser.isAllowed(url)) {
            return new ProtocolOutput(null,
                new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
          }
        } catch (Throwable e) {
          // XXX Maybe bogus: assume this is allowed.
          LOG.fine("Exception checking robot rules for " + url + ": " + e);
        }

        InetAddress addr = blockAddr(url);
        HttpResponse response;
        try {
          response = new HttpResponse(urlString, url); // make a request
        } finally {
          unblockAddr(addr); // always release the host, even on failure
        }

        int code = response.getCode();

        if (code == 200) { // got a good response
          return new ProtocolOutput(response.toContent()); // return it

        } else if (code == 410) { // page is gone
          return new ProtocolOutput(null,
              new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));

        } else if (code >= 300 && code < 400) { // handle redirect
          String location = response.getHeader("Location");
          // A 304 normally carries no Location header; resolving a null spec
          // would throw and turn "not modified" into an exception status.
          if (location != null || code != 304) {
            url = new URL(url, location);
          }
          int protocolStatusCode;
          switch (code) {
            case 302:   // found (temporarily moved)
            case 303:   // see other (redirect after POST)
            case 307:   // temporary redirect
              protocolStatusCode = ProtocolStatus.TEMP_MOVED;
              break;
            case 304:   // not modified
              protocolStatusCode = ProtocolStatus.NOTMODIFIED;
              break;
            default:    // 300, 301, 305 and any other 3xx code
              protocolStatusCode = ProtocolStatus.MOVED;
          }
          // handle this in the higher layer.
          return new ProtocolOutput(null, new ProtocolStatus(protocolStatusCode, url));

        } else if (code == 400) { // bad request, mark as GONE
          LOG.fine("400 Bad request: " + url);
          return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.GONE, url));

        } else if (code == 401) { // requires authorization
          LOG.fine("401 Authentication Required");
          if (redirects == MAX_REDIRECTS) {
            return new ProtocolOutput(null,
                new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
                    "Too many redirects: " + urlString));
          }
          Properties p = response.toContent().getMetadata();
          if (p instanceof MultiProperties) {
            auth = HttpAuthenticationFactory.findAuthentication((MultiProperties) p);
          } else {
            return new ProtocolOutput(null,
                new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
                    "Authorization required: " + urlString));
          }
          redirects++; // loop around and repeat the request

        } else if (code == 404) {
          return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));

        } else {
          return new ProtocolOutput(null,
              new ProtocolStatus(ProtocolStatus.EXCEPTION,
                  "Http code=" + code + ", url=" + url));
        }
      }
    } catch (Throwable e) {
      // Report through the class logger (with stack trace) instead of stderr.
      LOG.log(Level.WARNING, "Failed to fetch " + urlString, e);
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }

  /**
   * Builds the User-Agent string from the http.agent.* configuration
   * properties, in the form
   * <code>name[/version] [(description; url; email)]</code>.
   * The parenthesised part lists only the non-empty values, separated by
   * "; ", and is omitted when all three are unset or empty.
   *
   * A missing or blank http.agent.name is logged as severe; the name is still
   * appended as-is (a null name appears as the text "null").
   *
   * @return the assembled agent string
   */
  private static String getAgentString() {
    NutchConf conf = NutchConf.get();
    String agentName = conf.get("http.agent.name");
    String agentVersion = conf.get("http.agent.version");
    String agentDesc = conf.get("http.agent.description");
    String agentURL = conf.get("http.agent.url");
    String agentEmail = conf.get("http.agent.email");

    if ((agentName == null) || (agentName.trim().length() == 0)) {
      LOG.severe("No User-Agent string set (http.agent.name)!");
    }

    // A part counts as present only when it is non-null AND non-empty. The same
    // test drives the separators, so an empty-string setting cannot leave a
    // dangling "; " behind.
    boolean hasDesc = (agentDesc != null) && (agentDesc.length() != 0);
    boolean hasURL = (agentURL != null) && (agentURL.length() != 0);
    boolean hasEmail = (agentEmail != null) && (agentEmail.length() != 0);

    StringBuffer buf = new StringBuffer();

    buf.append(agentName);
    if (agentVersion != null) {
      buf.append("/");
      buf.append(agentVersion);
    }
    if (hasDesc || hasURL || hasEmail) {
      buf.append(" (");

      if (hasDesc) {
        buf.append(agentDesc);
        if (hasURL || hasEmail) {
          buf.append("; ");
        }
      }

      if (hasURL) {
        buf.append(agentURL);
        if (hasEmail) {
          buf.append("; ");
        }
      }

      if (hasEmail) {
        buf.append(agentEmail);
      }

      buf.append(")");
    }
    return buf.toString();
  }

  /**
   * For debugging: fetches a single URL and prints its status and content.
   *
   * Usage: <code>Http [-verbose] [-timeout N] url</code>, where N is the
   * timeout in seconds and the URL must be the last argument. On a malformed
   * command line the usage text is printed and the JVM exits with -1.
   */
  public static void main(String[] args) throws Exception {
    boolean verbose = false;
    String url = null;

    String usage = "Usage: Http [-verbose] [-timeout N] url";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) { // parse command line
      if (args[i].equals("-timeout")) { // found -timeout option
        if (i == args.length - 1) { // option given without its value
          System.err.println(usage);
          System.exit(-1);
        }
        TIMEOUT = Integer.parseInt(args[++i]) * 1000;
      } else if (args[i].equals("-verbose")) { // found -verbose option
        verbose = true;
      } else if (i != args.length - 1) {
        System.err.println(usage);
        System.exit(-1);
      } else { // url is the required, final parameter
        url = args[i];
      }
    }

    if (url == null) { // only options were given
      System.err.println(usage);
      System.exit(-1);
    }

    Http http = new Http();

    if (verbose) {
      LOG.setLevel(Level.FINE);
    }

    ProtocolOutput out = http.getProtocolOutput(url);
    Content content = out.getContent();

    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
      System.out.println("Content Length: " + content.get("Content-Length"));
      System.out.println("Content:");
      // Decoded with the platform default charset; acceptable for a debug dump.
      String text = new String(content.getContent());
      System.out.println(text);
    }

  }

  /**
   * Creates the shared HttpClient and applies the static settings: HTTPS
   * protocol registration, connection limits, timeouts, buffer sizes, the
   * optional proxy, and the NTLM and BASIC credentials.
   * Called from getClient() the first time a client is requested.
   */
  private static void configureClient() {

    // One client instance is all we need.
    client = new HttpClient(connectionManager);
    // NutchHttpState only adds logging whenever cookies are added.
    client.setState(new NutchHttpState());

    // Register an HTTPS socket factory that accepts self-signed certs.
    Protocol.registerProtocol("https",
        new Protocol("https", new DummySSLProtocolSocketFactory(), 443));

    // Connection manager: only the total is limited here; no per-host
    // connection limit is set in this method.
    connectionManager.setMaxTotalConnections(MAX_THREADS_TOTAL);

    HttpConnectionParams connParams = connectionManager.getParams();
    connParams.setConnectionTimeout(TIMEOUT);
    connParams.setSoTimeout(TIMEOUT);
    connParams.setSendBufferSize(BUFFER_SIZE);
    connParams.setReceiveBufferSize(BUFFER_SIZE);

    HostConfiguration hostConfiguration = client.getHostConfiguration();
    if (PROXY) {
      hostConfiguration.setProxy(PROXY_HOST, PROXY_PORT);
    }

    HttpState state = client.getState();

    if (NTLM_USERNAME.length() > 0) {
      // Null realm and host: the NTLM credentials are not tied to a scope.
      Credentials ntCreds =
          new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
      state.setCredentials(null, null, ntCreds);

      LOG.info("Added NTLM credentials for " + NTLM_USERNAME);
    }

    // ieb: put the BASIC credentials into the state, one scope per realm,
    // valid for any host and any port.
    for (int idx = 0; idx < BASIC_USERNAMES.length; idx++) {
      AuthScope realmScope =
          new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, BASIC_REALMS[idx]);
      Credentials basicCred =
          new UsernamePasswordCredentials(BASIC_USERNAMES[idx], BASIC_PASSWORDS[idx]);
      state.setCredentials(realmScope, basicCred);
      LOG.info("Added BASIC credentials for " + BASIC_USERNAMES[idx]);
    }

    LOG.info("Configured Client");
  }
}

/**
 * HttpState that logs, at FINE level, every cookie handed to it before
 * delegating to the superclass implementation.
 */
class NutchHttpState extends HttpState {
  public static final Logger LOG =
      LogFormatter.getLogger("org.apache.nutch.net.Http.NutchHttpState");

  /** Logs the cookie, then stores it via HttpState. */
  public void addCookie(Cookie cookie) {
    LOG.fine(" - setting cookie: " + cookie);
    super.addCookie(cookie);
  }

  /** Logs a header line and each cookie in turn, then stores them all via HttpState. */
  public void addCookies(Cookie[] cookies) {
    LOG.fine(" - setting cookies: ");
    int idx = 0;
    while (idx < cookies.length) {
      LOG.fine("   cookie: " + cookies[idx]);
      idx++;
    }

    super.addCookies(cookies);
  }
}

Reply via email to