The Fetcher is using the correct proxy, but the DNS lookup isn't getting out through it. Take a look at this; it might help.
http://www.rgagnon.com/javadetails/java-0085.html Dennis Kubes Damian Florczyk wrote: > ekoje ekoje napisał(a): >> Hello, I tried to modify Nutch in order to pass through a web proxy as >> advice below but it still doesn'tr work. >> >> I've got the following error: >> >> 2007-02-15 17:04:58,285 INFO fetcher.Fetcher - fetching >> http://lucene.apache.org/nutch/ >> 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.host = ncproxy1 >> 2007-02-15 17:04:58,300 INFO http.Http - http.proxy.port = 8080 >> 2007-02-15 17:04:58,300 INFO http.Http - http.timeout = 10000 >> 2007-02-15 17:04:58,300 INFO http.Http - http.content.limit = 65536 >> 2007-02-15 17:04:58,300 INFO http.Http - http.agent = >> NutchCVS/Nutch-0.9-dev >> (C:\pbapps\nutch-nightly\conf\nutch-default.xml) >> 2007-02-15 17:04:58,300 INFO http.Http - >> protocol.plugin.check.blocking = true >> 2007-02-15 17:04:58,300 INFO http.Http - protocol.plugin.check.robots >> = true >> 2007-02-15 17:04:58,300 INFO http.Http - fetcher.server.delay = 1000 >> 2007-02-15 17:04:58,300 INFO http.Http - http.max.delays = 1000 >> 2007-02-15 17:04:58,316 ERROR http.Http - >> org.apache.nutch.protocol.http.api.HttpException: >> java.net.UnknownHostException: >> lucene.apache.org: lucene.apache.org >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:340) >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> org.apache.nutch.protocol.http.api.HttpBase.getProtocolOutput(HttpBase.java:212) >> >> >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> org.apache.nutch.fetcher.Fetcher$FetcherThread.run(Fetcher.java:145) >> 2007-02-15 17:04:58,316 ERROR http.Http - Caused by: >> java.net.UnknownHostException: >> lucene.apache.org: lucene.apache.org >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> java.net.InetAddress.getAllByName0(InetAddress.java:1128) >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> java.net.InetAddress.getAllByName0(InetAddress.java:1098) >> 2007-02-15 
17:04:58,316 ERROR http.Http - at >> java.net.InetAddress.getAllByName(InetAddress.java:1061) >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> java.net.InetAddress.getByName(InetAddress.java:958) >> 2007-02-15 17:04:58,316 ERROR http.Http - at >> org.apache.nutch.protocol.http.api.HttpBase.blockAddr(HttpBase.java:336) >> 2007-02-15 17:04:58,316 ERROR http.Http - ... 2 more >> 2007-02-15 17:04:58,316 INFO fetcher.Fetcher - fetch of >> http://lucene.apache.org/nutch/ failed with: >> org.apache.nutch.protocol.http.api.HttpException: >> java.net.UnknownHostException: >> lucene.apache.org >> : lucene.apache.org >> 2007-02-15 17:04:59,597 INFO plugin.PluginRepository - Plugins: >> looking in: >> C:\pbapps\nutch-nightly\plugins >> 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Plugin >> Auto-activation mode: >> [true] >> 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Registered >> Plugins: >> 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - >> CyberNeko HTML >> Parser (lib-nekohtml) >> 2007-02-15 17:04:59,722 INFO plugin.PluginRepository - Site >> Query Filter >> >> >> Could you please help me to go through this proxy with authentication ? >> >> Thanks, >> >> >> [-] Hi >> [-] >> [-] I was having the same problem running nutch behind a web proxy. >> [-] But with little changes in the plugin protocol-httpclient this >> works for >> [-] me. >> [-] >> [-] See source below for my changes. >> [-] >> [-] >> [-] public class Http extends HttpBase { >> [-] >> [-] public static final Log LOG = LogFactory.getLog(Http.class); >> [-] >> [-] private static MultiThreadedHttpConnectionManager >> connectionManager = >> [-] new MultiThreadedHttpConnectionManager(); >> [-] >> [-] // Since the Configuration has not yet been setted, >> [-] // then an unconfigured client is returned. 
>> [-] private static HttpClient client = new >> HttpClient(connectionManager); >> [-] >> [-] static synchronized HttpClient getClient() { >> [-] return client; >> [-] } >> [-] >> [-] boolean verbose = false; >> [-] int maxThreadsTotal = 10; >> [-] String ntlmUsername = ""; >> [-] String ntlmPassword = ""; >> [-] String ntlmDomain = ""; >> [-] String ntlmHost = ""; >> [-] >> [-] String proxyuser = ""; >> [-] String proxypass = ""; >> [-] >> [-] public Http() { >> [-] super(LOG); >> [-] } >> [-] >> [-] public void setConf(Configuration conf) { >> [-] super.setConf(conf); >> [-] this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10); >> [-] this.ntlmUsername = conf.get("http.auth.ntlm.username", ""); >> [-] this.ntlmPassword = conf.get("http.auth.ntlm.password", ""); >> [-] this.ntlmDomain = conf.get("http.auth.ntlm.domain", ""); >> [-] this.ntlmHost = conf.get("http.auth.ntlm.host", ""); >> [-] >> [-] >> [-] // add config for auth proxy >> [-] this.proxyuser = conf.get("http.auth.proxy.username", ""); >> [-] this.proxypass = conf.get("http.auth.proxy.password", ""); >> [-] >> [-] >> [-] //Level logLevel = Level.WARNING; >> [-] //if (conf.getBoolean("http.verbose", false)) { >> [-] // logLevel = Level.FINE; >> [-] //} >> [-] //LOG.setLevel(logLevel); >> [-] >> //Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector >> ") >> [-] // .setLevel(logLevel); >> [-] configureClient(); >> [-] } >> [-] >> [-] public static void main(String[] args) throws Exception { >> [-] Http http = new Http(); >> [-] http.setConf(NutchConfiguration.create()); >> [-] main(http, args); >> [-] } >> [-] >> [-] protected Response getResponse(URL url, CrawlDatum datum, boolean >> [-] redirect) >> [-] throws ProtocolException, IOException { >> [-] return new HttpResponse(this, url, datum, redirect); >> [-] } >> [-] >> [-] private void configureClient() { >> [-] >> [-] // Set up an HTTPS socket factory that accepts self-signed certs. 
>> [-] //Protocol dummyhttps = new Protocol("https", new >> [-] DummySSLProtocolSocketFactory(), 443); >> [-] //Protocol.registerProtocol("https", dummyhttps); >> [-] >> [-] HttpConnectionManagerParams params = >> connectionManager.getParams(); >> [-] params.setConnectionTimeout(timeout); >> [-] params.setSoTimeout(timeout); >> [-] params.setSendBufferSize(BUFFER_SIZE); >> [-] params.setReceiveBufferSize(BUFFER_SIZE); >> [-] params.setMaxTotalConnections(maxThreadsTotal); >> [-] if (maxThreadsTotal > maxThreadsPerHost) { >> [-] params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost); >> [-] } else { >> [-] params.setDefaultMaxConnectionsPerHost(maxThreadsTotal); >> [-] } >> [-] >> [-] HostConfiguration hostConf = client.getHostConfiguration(); >> [-] ArrayList headers = new ArrayList(); >> [-] // prefer English >> [-] headers.add(new Header("Accept-Language", >> [-] "en-us,en-gb,en;q=0.7,*;q=0.3")); >> [-] // prefer UTF-8 >> [-] headers.add(new Header("Accept-Charset", >> [-] "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); >> [-] // prefer understandable formats >> [-] headers.add(new Header("Accept", >> [-] >> [-] "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9 >> ,text/p >> [-] lain;q=0.8,image/png,*/*;q=0.5")); >> [-] // accept gzipped content >> [-] headers.add(new Header("Accept-Encoding", "x-gzip, gzip")); >> [-] hostConf.getParams().setParameter("http.default-headers", >> headers); >> [-] if (useProxy) { >> [-] hostConf.setProxy(proxyHost, proxyPort); >> [-] // add support for proxy authentication >> [-] if (proxyuser.length() > 0 ) { >> [-] Credentials proxyCreds = new >> [-] UsernamePasswordCredentials(proxyuser,proxypass); >> [-] client.getState().setProxyCredentials(new >> [-] AuthScope(proxyHost,AuthScope.ANY_PORT), proxyCreds); >> [-] } >> [-] } >> [-] if (ntlmUsername.length() > 0) { >> [-] Credentials ntCreds = new NTCredentials(ntlmUsername, >> ntlmPassword, >> [-] ntlmHost, ntlmDomain); >> [-] client.getState().setCredentials(new 
AuthScope(ntlmHost, >> [-] AuthScope.ANY_PORT), ntCreds); >> [-] >> [-] if (LOG.isInfoEnabled()) { >> [-] LOG.info("Added NTLM credentials for " + ntlmUsername); >> [-] } >> [-] } >> [-] if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); } >> [-] } >> [-] } >> [-] >> [-] >> [-] -----Ursprüngliche Nachricht----- >> [-] Von: ekoje ekoje [mailto:[EMAIL PROTECTED] >> [-] Gesendet: Donnerstag, 8. Februar 2007 15:36 >> [-] An: [email protected] >> [-] Betreff: Web Proxy >> [-] >> [-] Hi Guys, >> [-] >> [-] I would like to run nutch but I'm behind a web proxy with >> authentication. >> [-] >> [-] I use nutch-0.8.1 under Windows XP. I've configured nutch-site.xml to >> [-] specify >> [-] my proxy host and port but how do I specify the username and >> password ? >> [-] >> [-] Could you please help me ? >> [-] >> [-] Thanks >> [-] >> > This exception tells you that Nutch can't resolve the hostname. Try to pass the IP > rather than the hostname of your proxy server (if possible), or make > sure that Nutch can resolve the hostname. > ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-general mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/nutch-general
