Hello, I tried to modify Nutch in order to pass through a web proxy, as advised below, but it still doesn't work.
I've got the following error: 2007-02-15 17:04:58,285 INFO fetcher.Fetcher - fetching http://lucene.apache.org/nutch/ 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.host = ncproxy1 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.port = 8080 2007-02-15 17:04:58,300 INFO http.Http - http.timeout = 10000 2007-02-15 17:04:58,300 INFO http.Http - http.content.limit = 65536 2007-02-15 17:04:58,300 INFO http.Http - http.agent = NutchCVS/Nutch-0.9-dev (C:\pbapps\nutch-nightly\conf\nutch-default.xml) 2007-02-15 17:04:58,300 INFO http.Http - protocol.plugin.check.blocking = true 2007-02-15 17:04:58,300 INFO http.Http - protocol.plugin.check.robots = true 2007-02-15 17:04:58,300 INFO http.Http - fetcher.server.delay = 1000 2007-02-15 17:04:58,300 INFO http.Http - http.max.delays = 1000 2007-02-15 17:04:58,316 ERROR http.Http - org.apache.nutch.protocol.http.api.HttpException: java.net.UnknownHostException: lucene.apache.org: lucene.apache.org 2007-02-15 17:04:58,316 ERROR http.Http - at org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:340) 2007-02-15 17:04:58,316 ERROR http.Http - at org.apache.nutch.protocol.http.api.HttpBase.getProtocolOutput(HttpBase.java:212) 2007-02-15 17:04:58,316 ERROR http.Http - at org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:145) 2007-02-15 17:04:58,316 ERROR http.Http - Caused by: java.net.UnknownHostException: lucene.apache.org: lucene.apache.org 2007-02-15 17:04:58,316 ERROR http.Http - at java.net.InetAddress.getAllByName0(InetAddress.java:1128) 2007-02-15 17:04:58,316 ERROR http.Http - at java.net.InetAddress.getAllByName0(InetAddress.java:1098) 2007-02-15 17:04:58,316 ERROR http.Http - at java.net.InetAddress.getAllByName(InetAddress.java:1061) 2007-02-15 17:04:58,316 ERROR http.Http - at java.net.InetAddress.getByName(InetAddress.java:958) 2007-02-15 17:04:58,316 ERROR http.Http - at org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:336) 2007-02-15 17:04:58,316 ERROR http.Http - 
... 2 more 2007-02-15 17:04:58,316 INFO fetcher.Fetcher - fetch of http://lucene.apache.org/nutch/ failed with: org.apache.nutch.protocol.http.api.HttpException: java.net.UnknownHostException: lucene.apache.org : lucene.apache.org 2007-02-15 17:04:59,597 INFO plugin.PluginRepository - Plugins: looking in: C:\pbapps\nutch-nightly\plugins 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Plugin Auto-activation mode: [true] 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Registered Plugins: 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - CyberNeko HTML Parser (lib-nekohtml) 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Site Query Filter Could you please help me to go through this proxy with authentication ? Thanks, [-] Hi [-] [-] I was having the same problem running nutch behind a web proxy. [-] But with little changes in the plugin protocol-httpclient this works for [-] me. [-] [-] See source below for my changes. [-] [-] [-] public class Http extends HttpBase { [-] [-] public static final Log LOG = LogFactory.getLog(Http.class); [-] [-] private static MultiThreadedHttpConnectionManager connectionManager = [-] new MultiThreadedHttpConnectionManager(); [-] [-] // Since the Configuration has not yet been setted, [-] // then an unconfigured client is returned. 
[-] private static HttpClient client = new HttpClient(connectionManager); [-] [-] static synchronized HttpClient getClient() { [-] return client; [-] } [-] [-] boolean verbose = false; [-] int maxThreadsTotal = 10; [-] String ntlmUsername = ""; [-] String ntlmPassword = ""; [-] String ntlmDomain = ""; [-] String ntlmHost = ""; [-] [-] String proxyuser = ""; [-] String proxypass = ""; [-] [-] public Http() { [-] super(LOG); [-] } [-] [-] public void setConf(Configuration conf) { [-] super.setConf(conf); [-] this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10); [-] this.ntlmUsername = conf.get("http.auth.ntlm.username", ""); [-] this.ntlmPassword = conf.get("http.auth.ntlm.password", ""); [-] this.ntlmDomain = conf.get("http.auth.ntlm.domain", ""); [-] this.ntlmHost = conf.get("http.auth.ntlm.host", ""); [-] [-] [-] // add config for auth proxy [-] this.proxyuser = conf.get("http.auth.proxy.username", ""); [-] this.proxypass = conf.get("http.auth.proxy.password", ""); [-] [-] [-] //Level logLevel = Level.WARNING; [-] //if (conf.getBoolean("http.verbose", false)) { [-] // logLevel = Level.FINE; [-] //} [-] //LOG.setLevel(logLevel); [-] //Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector ") [-] // .setLevel(logLevel); [-] configureClient(); [-] } [-] [-] public static void main(String[] args) throws Exception { [-] Http http = new Http(); [-] http.setConf(NutchConfiguration.create()); [-] main(http, args); [-] } [-] [-] protected Response getResponse(URL url, CrawlDatum datum, boolean [-] redirect) [-] throws ProtocolException, IOException { [-] return new HttpResponse(this, url, datum, redirect); [-] } [-] [-] private void configureClient() { [-] [-] // Set up an HTTPS socket factory that accepts self-signed certs. 
[-] //Protocol dummyhttps = new Protocol("https", new [-] DummySSLProtocolSocketFactory(), 443); [-] //Protocol.registerProtocol("https", dummyhttps); [-] [-] HttpConnectionManagerParams params = connectionManager.getParams(); [-] params.setConnectionTimeout(timeout); [-] params.setSoTimeout(timeout); [-] params.setSendBufferSize(BUFFER_SIZE); [-] params.setReceiveBufferSize(BUFFER_SIZE); [-] params.setMaxTotalConnections(maxThreadsTotal); [-] if (maxThreadsTotal > maxThreadsPerHost) { [-] params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost); [-] } else { [-] params.setDefaultMaxConnectionsPerHost(maxThreadsTotal); [-] } [-] [-] HostConfiguration hostConf = client.getHostConfiguration(); [-] ArrayList headers = new ArrayList(); [-] // prefer English [-] headers.add(new Header("Accept-Language", [-] "en-us,en-gb,en;q=0.7,*;q=0.3")); [-] // prefer UTF-8 [-] headers.add(new Header("Accept-Charset", [-] "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); [-] // prefer understandable formats [-] headers.add(new Header("Accept", [-] [-] "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9 ,text/p [-] lain;q=0.8,image/png,*/*;q=0.5")); [-] // accept gzipped content [-] headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); [-] hostConf.getParams().setParameter("http.default-headers", headers); [-] if (useProxy) { [-] hostConf.setProxy(proxyHost, proxyPort); [-] // add support for proxy authentication [-] if (proxyuser.length() > 0 ) { [-] Credentials proxyCreds = new [-] UsernamePasswordCredentials(proxyuser,proxypass); [-] client.getState().setProxyCredentials(new [-] AuthScope(proxyHost,AuthScope.ANY_PORT), proxyCreds); [-] } [-] } [-] if (ntlmUsername.length() > 0) { [-] Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, [-] ntlmHost, ntlmDomain); [-] client.getState().setCredentials(new AuthScope(ntlmHost, [-] AuthScope.ANY_PORT), ntCreds); [-] [-] if (LOG.isInfoEnabled()) { [-] LOG.info("Added NTLM credentials for " + ntlmUsername); [-] } 
[-] } [-] if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); } [-] } [-] } [-] [-] [-] -----Ursprüngliche Nachricht----- [-] Von: ekoje ekoje [mailto:[EMAIL PROTECTED] [-] Gesendet: Donnerstag, 8. Februar 2007 15:36 [-] An: [email protected] [-] Betreff: Web Proxy [-] [-] Hi Guys, [-] [-] I would like to run nutch but I'm behind a web proxy with authentication. [-] [-] I use nutch-0.8.1 under Windows XP. I've configured nutch-site.xml to [-] specify [-] my proxy host and port but how do I specify the username and password? [-] [-] Could you please help me? [-] [-] Thanks [-]
