http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java deleted file mode 100644 index 0cc2de5..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.httpclient; - -// JDK imports -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -// Commons Codec imports -import org.apache.commons.codec.binary.Base64; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configurable; - -/** - * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are - * stored in standard Nutch configuration files using the following properties: - * http.auth.basic.<realm>.user http.auth.basic.<realm>.pass - * - * @author Matt Tencati - */ -public class HttpBasicAuthentication implements HttpAuthentication, - Configurable { - - public static final Logger LOG = LoggerFactory - .getLogger(HttpBasicAuthentication.class); - - private static Pattern basic = Pattern - .compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\""); - - private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>(); - - private Configuration conf = null; - private String challenge = null; - private ArrayList<String> credentials = null; - private String realm = null; - - /** - * Construct an HttpBasicAuthentication for the given challenge parameters. - * The challenge parameters are returned by the web server using a - * WWW-Authenticate header. This will typically be represented by single line - * of the form <code>WWW-Authenticate: Basic realm="myrealm"</code> - * - * @param challenge - * WWW-Authenticate header from web server - */ - protected HttpBasicAuthentication(String challenge, Configuration conf) - throws HttpAuthenticationException { - - setConf(conf); - this.challenge = challenge; - credentials = new ArrayList<String>(); - - String username = this.conf.get("http.auth.basic." + challenge + ".user"); - String password = this.conf.get("http.auth.basic." + challenge - + ".password"); - - if (LOG.isTraceEnabled()) { - LOG.trace("BasicAuthentication challenge is " + challenge); - LOG.trace("BasicAuthentication username=" + username); - LOG.trace("BasicAuthentication password=" + password); - } - - if (username == null) { - throw new HttpAuthenticationException("Username for " + challenge - + " is null"); - } - - if (password == null) { - throw new HttpAuthenticationException("Password for " + challenge - + " is null"); - } - - byte[] credBytes = (username + ":" + password).getBytes(); - credentials.add("Authorization: Basic " - + new String(Base64.encodeBase64(credBytes))); - if (LOG.isTraceEnabled()) { - LOG.trace("Basic credentials: " + credentials); - } - } - - /* - * ---------------------------------- * <implementation:Configurable> * - * ---------------------------------- - */ - - public void setConf(Configuration conf) { - this.conf = conf; - // if (conf.getBoolean("http.auth.verbose", false)) { - // LOG.setLevel(Level.FINE); - // } else { - // LOG.setLevel(Level.WARNING); - // } - } - - public Configuration getConf() { - return this.conf; - } - - /* - * ---------------------------------- * <implementation:Configurable> * - * ---------------------------------- - */ - - /** - * Gets the Basic credentials generated by this HttpBasicAuthentication object - * - * @return Credentials in the form of - * <code>Authorization: Basic <Base64 encoded userid:password> - * - */ - public List<String> getCredentials() { - return credentials; - } - - /** - * Gets the realm attribute of the HttpBasicAuthentication object. This should - * have been supplied to the {@link #getAuthentication(String, Configuration)} - * static method - * - * @return The realm - */ - public String getRealm() { - return realm; - } - - /** - * This method is responsible for providing Basic authentication information. - * The method caches authentication information for each realm so that the - * required authentication information does not need to be regenerated for - * every request. - * - * @param challenge - * The challenge string provided by the webserver. This is the text - * which follows the WWW-Authenticate header, including the Basic - * tag. - * @return An HttpBasicAuthentication object or null if unable to generate - * appropriate credentials. - */ - public static HttpBasicAuthentication getAuthentication(String challenge, - Configuration conf) { - if (challenge == null) - return null; - Matcher basicMatcher = basic.matcher(challenge); - if (basicMatcher.matches()) { - String realm = basicMatcher.group(1); - Object auth = authMap.get(realm); - if (auth == null) { - HttpBasicAuthentication newAuth = null; - try { - newAuth = new HttpBasicAuthentication(realm, conf); - } catch (HttpAuthenticationException hae) { - if (LOG.isTraceEnabled()) { - LOG.trace("HttpBasicAuthentication failed for " + challenge); - } - } - authMap.put(realm, newAuth); - return newAuth; - } else { - return (HttpBasicAuthentication) auth; - } - } - return null; - } - - /** - * Provides a pattern which can be used by an outside resource to determine if - * this class can provide credentials based on simple header information. It - * does not calculate any information regarding realms or challenges. - * - * @return Returns a Pattern which will match a Basic WWW-Authenticate header. - */ - public static final Pattern getBasicPattern() { - return basic; - } -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java deleted file mode 100644 index b713ab6..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.httpclient; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -public class HttpFormAuthConfigurer { - private String loginUrl; - private String loginFormId; - /** - * The data posted to login form, such as username(or email), password - */ - private Map<String, String> loginPostData; - /** - * In case we need add additional headers. - */ - private Map<String, String> additionalPostHeaders; - /** - * If http post login returns redirect code: 301 or 302, - * Http Client will automatically follow the redirect. - */ - private boolean loginRedirect; - /** - * Used when we need remove some form fields. - */ - private Set<String> removedFormFields; - - public HttpFormAuthConfigurer() { - } - - public String getLoginUrl() { - return loginUrl; - } - - public HttpFormAuthConfigurer setLoginUrl(String loginUrl) { - this.loginUrl = loginUrl; - return this; - } - - public String getLoginFormId() { - return loginFormId; - } - - public HttpFormAuthConfigurer setLoginFormId(String loginForm) { - this.loginFormId = loginForm; - return this; - } - - public Map<String, String> getLoginPostData() { - return loginPostData == null ? new HashMap<String, String>() - : loginPostData; - } - - public HttpFormAuthConfigurer setLoginPostData( - Map<String, String> loginPostData) { - this.loginPostData = loginPostData; - return this; - } - - public Map<String, String> getAdditionalPostHeaders() { - return additionalPostHeaders == null ? new HashMap<String, String>() - : additionalPostHeaders; - } - - public HttpFormAuthConfigurer setAdditionalPostHeaders( - Map<String, String> additionalPostHeaders) { - this.additionalPostHeaders = additionalPostHeaders; - return this; - } - - public boolean isLoginRedirect() { - return loginRedirect; - } - - public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) { - this.loginRedirect = redirect; - return this; - } - - public Set<String> getRemovedFormFields() { - return removedFormFields == null ? new HashSet<String>() - : removedFormFields; - } - - public HttpFormAuthConfigurer setRemovedFormFields( - Set<String> removedFormFields) { - this.removedFormFields = removedFormFields; - return this; } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java deleted file mode 100644 index 4c73f50..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.httpclient; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.CookieHandler; -import java.net.CookieManager; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.NameValuePair; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.methods.PostMethod; -import org.apache.commons.io.IOUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class HttpFormAuthentication { - private static final Logger LOGGER = LoggerFactory - .getLogger(HttpFormAuthentication.class); - private static Map<String, String> defaultLoginHeaders = new HashMap<String, String>(); - - static { - defaultLoginHeaders.put("User-Agent", "Mozilla/5.0"); - defaultLoginHeaders - .put("Accept", - "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5"); - defaultLoginHeaders.put("Connection", "keep-alive"); - defaultLoginHeaders.put("Content-Type", - "application/x-www-form-urlencoded"); - } - - private HttpClient client; - private HttpFormAuthConfigurer authConfigurer = new HttpFormAuthConfigurer(); - private String cookies; - - public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer, - HttpClient client, Http http) { - this.authConfigurer = authConfigurer; - this.client = client; - defaultLoginHeaders.put("Accept", http.getAccept()); - defaultLoginHeaders.put("Accept-Language", http.getAcceptLanguage()); - defaultLoginHeaders.put("User-Agent", http.getUserAgent()); - } - - public HttpFormAuthentication(String loginUrl, String loginForm, - Map<String, String> loginPostData, - Map<String, String> additionalPostHeaders, - Set<String> removedFormFields) { - this.authConfigurer.setLoginUrl(loginUrl); - this.authConfigurer.setLoginFormId(loginForm); - this.authConfigurer - .setLoginPostData(loginPostData == null ? new HashMap<String, String>() - : loginPostData); - this.authConfigurer - .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>() - : additionalPostHeaders); - this.authConfigurer - .setRemovedFormFields(removedFormFields == null ? new HashSet<String>() - : removedFormFields); - this.client = new HttpClient(); - } - - public void login() throws Exception { - // make sure cookies are turned on - CookieHandler.setDefault(new CookieManager()); - String pageContent = httpGetPageContent(authConfigurer.getLoginUrl()); - List<NameValuePair> params = getLoginFormParams(pageContent); - sendPost(authConfigurer.getLoginUrl(), params); - } - - private void sendPost(String url, List<NameValuePair> params) - throws Exception { - PostMethod post = null; - try { - if (authConfigurer.isLoginRedirect()) { - post = new PostMethod(url) { - @Override - public boolean getFollowRedirects() { - return true; - } - }; - } else { - post = new PostMethod(url); - } - // we can't use post.setFollowRedirects(true) as it will throw - // IllegalArgumentException: - // Entity enclosing requests cannot be redirected without user - // intervention - setLoginHeader(post); - post.addParameters(params.toArray(new NameValuePair[0])); - int rspCode = client.executeMethod(post); - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("rspCode: " + rspCode); - LOGGER.debug("\nSending 'POST' request to URL : " + url); - - LOGGER.debug("Post parameters : " + params); - LOGGER.debug("Response Code : " + rspCode); - for (Header header : post.getRequestHeaders()) { - LOGGER.debug("Response headers : " + header); - } - } - String rst = IOUtils.toString(post.getResponseBodyAsStream()); - LOGGER.debug("login post result: " + rst); - } finally { - if (post != null) { - post.releaseConnection(); - } - } - } - - private void setLoginHeader(PostMethod post) { - Map<String, String> headers = new HashMap<String, String>(); - headers.putAll(defaultLoginHeaders); - // additionalPostHeaders can overwrite value in defaultLoginHeaders - headers.putAll(authConfigurer.getAdditionalPostHeaders()); - for (Entry<String, String> entry : headers.entrySet()) { - post.addRequestHeader(entry.getKey(), entry.getValue()); - } - post.addRequestHeader("Cookie", getCookies()); - } - - private String httpGetPageContent(String url) throws IOException { - - GetMethod get = new GetMethod(url); - try { - for (Entry<String, String> entry : authConfigurer - .getAdditionalPostHeaders().entrySet()) { - get.addRequestHeader(entry.getKey(), entry.getValue()); - } - client.executeMethod(get); - Header cookieHeader = get.getResponseHeader("Set-Cookie"); - if (cookieHeader != null) { - setCookies(cookieHeader.getValue()); - } - String rst = IOUtils.toString(get.getResponseBodyAsStream()); - return rst; - } finally { - get.releaseConnection(); - } - - } - - private List<NameValuePair> getLoginFormParams(String pageContent) - throws UnsupportedEncodingException { - List<NameValuePair> params = new ArrayList<NameValuePair>(); - Document doc = Jsoup.parse(pageContent); - Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); - if (loginform == null) { - LOGGER.debug("No form element found with 'id' = {}, trying 'name'.", - authConfigurer.getLoginFormId()); - loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first(); - if (loginform == null) { - LOGGER.debug("No form element found with 'name' = {}", - authConfigurer.getLoginFormId()); - throw new IllegalArgumentException("No form exists: " - + authConfigurer.getLoginFormId()); - } - } - Elements inputElements = loginform.getElementsByTag("input"); - // skip fields in removedFormFields or loginPostData - for (Element inputElement : inputElements) { - String key = inputElement.attr("name"); - String value = inputElement.attr("value"); - if (authConfigurer.getLoginPostData().containsKey(key) - || authConfigurer.getRemovedFormFields().contains(key)) { - // value = loginPostData.get(key); - continue; - } - params.add(new NameValuePair(key, value)); - } - // add key and value in loginPostData - for (Entry<String, String> entry : authConfigurer.getLoginPostData() - .entrySet()) { - params.add(new NameValuePair(entry.getKey(), entry.getValue())); - } - return params; - } - - public String getCookies() { - return cookies; - } - - public void setCookies(String cookies) { - this.cookies = cookies; - } - - public boolean isRedirect() { - return authConfigurer.isLoginRedirect(); - } - - public void setRedirect(boolean redirect) { - this.authConfigurer.setLoginRedirect(redirect); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java deleted file mode 100644 index f074af2..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.httpclient; - -// JDK imports -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - -// HTTP Client imports -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HttpVersion; -import org.apache.commons.httpclient.cookie.CookiePolicy; -import org.apache.commons.httpclient.methods.GetMethod; -import org.apache.commons.httpclient.params.HttpMethodParams; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpClient; - - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.SpellCheckedMetadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.http.api.HttpBase; - -/** - * An HTTP response. - * - * @author Susam Pal - */ -public class HttpResponse implements Response { - - private URL url; - private byte[] content; - private int code; - private Metadata headers = new SpellCheckedMetadata(); - - /** - * Fetches the given <code>url</code> and prepares HTTP response. - * - * @param http - * An instance of the implementation class of this plugin - * @param url - * URL to be fetched - * @param datum - * Crawl data - * @param followRedirects - * Whether to follow redirects; follows redirect if and only if this - * is true - * @return HTTP response - * @throws IOException - * When an error occurs - */ - HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects) - throws IOException { - - // Prepare GET method for HTTP request - this.url = url; - GetMethod get = new GetMethod(url.toString()); - get.setFollowRedirects(followRedirects); - get.setDoAuthentication(true); - if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) { - get.setRequestHeader("If-Modified-Since", - HttpDateFormat.toString(datum.getModifiedTime())); - } - - // Set HTTP parameters - HttpMethodParams params = get.getParams(); - if (http.getUseHttp11()) { - params.setVersion(HttpVersion.HTTP_1_1); - } else { - params.setVersion(HttpVersion.HTTP_1_0); - } - params.makeLenient(); - params.setContentCharset("UTF-8"); - params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); - params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true); - // XXX (ab) not sure about this... the default is to retry 3 times; if - // XXX the request body was sent the method is not retried, so there is - // XXX little danger in retrying... - // params.setParameter(HttpMethodParams.RETRY_HANDLER, null); - try { - HttpClient client = Http.getClient(); - client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941 - code = client.executeMethod(get); - - Header[] heads = get.getResponseHeaders(); - - for (int i = 0; i < heads.length; i++) { - headers.set(heads[i].getName(), heads[i].getValue()); - } - - // Limit download size - int contentLength = Integer.MAX_VALUE; - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - try { - contentLength = Integer.parseInt(contentLengthString.trim()); - } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + contentLengthString); - } - } - if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { - contentLength = http.getMaxContent(); - } - - // always read content. Sometimes content is useful to find a cause - // for error. - InputStream in = get.getResponseBodyAsStream(); - try { - byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; - int bufferFilled = 0; - int totalRead = 0; - ByteArrayOutputStream out = new ByteArrayOutputStream(); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 - && totalRead + bufferFilled <= contentLength) { - totalRead += bufferFilled; - out.write(buffer, 0, bufferFilled); - } - - content = out.toByteArray(); - } catch (Exception e) { - if (code == 200) - throw new IOException(e.toString()); - // for codes other than 200 OK, we are fine with empty content - } finally { - if (in != null) { - in.close(); - } - get.abort(); - } - - StringBuilder fetchTrace = null; - if (Http.LOG.isTraceEnabled()) { - // Trace message - fetchTrace = new StringBuilder("url: " + url + "; status code: " + code - + "; bytes received: " + content.length); - if (getHeader(Response.CONTENT_LENGTH) != null) - fetchTrace.append("; Content-Length: " - + getHeader(Response.CONTENT_LENGTH)); - if (getHeader(Response.LOCATION) != null) - fetchTrace.append("; Location: " + getHeader(Response.LOCATION)); - } - // Extract gzip, x-gzip and deflate content - if (content != null) { - // check if we have to uncompress it - String contentEncoding = headers.get(Response.CONTENT_ENCODING); - if (contentEncoding != null && Http.LOG.isTraceEnabled()) - fetchTrace.append("; Content-Encoding: " + contentEncoding); - if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) { - content = http.processGzipEncoded(content, url); - if (Http.LOG.isTraceEnabled()) - fetchTrace.append("; extracted to " + content.length + " bytes"); - } else if ("deflate".equals(contentEncoding)) { - content = http.processDeflateEncoded(content, url); - if (Http.LOG.isTraceEnabled()) - fetchTrace.append("; extracted to " + content.length + " bytes"); - } - } - - // Logger trace message - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace(fetchTrace.toString()); - } - } finally { - get.releaseConnection(); - } - } - - /* - * ------------------------- * <implementation:Response> * - * ------------------------- - */ - - public URL getUrl() { - return url; - } - - public int getCode() { - return code; - } - - public String getHeader(String name) { - return headers.get(name); - } - - public Metadata getHeaders() { - return headers; - } - - public byte[] getContent() { - return content; - } - - /* - * -------------------------- * </implementation:Response> * - * -------------------------- - */ -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html deleted file mode 100644 index 9cbcb14..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the HTTP and -HTTPS protocols, optionally with Basic, Digest and NTLM authentication -schemes for web server as well as proxy server. It handles cookies -within a single fetch operation. This plugin is based on Jakarta -Commons HttpClient library.</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml b/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml deleted file mode 100644 index 3c0203b..0000000 --- a/src/plugin/protocol-httpclient/src/test/conf/httpclient-auth-test.xml +++ /dev/null @@ -1,58 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<auth-configuration> - - <!-- Default credentials --> - <credentials username="userx" password="passx"> - <default/> - <authscope host="127.0.0.1" port="47500"/> - </credentials> - - <!-- Defined a realm for 127.0.0.1:47501 so that authentication for - other realms fail (except another realm for 127.0.0.1:47501 is - defined below for NTLM scheme). --> - <credentials username="userx" password="passx"> - <authscope host="127.0.0.1" port="47501" realm="realmx" - scheme="BASIC"/> - </credentials> - - <!-- Test case for NTLM authentication scheme. --> - <credentials username="ntlm_user" password="ntlm_pass"> - <authscope host="127.0.0.1" port="47501" realm="NUTCH" - scheme="NTLM"/> - </credentials> - - <!-- Test case for credentials selection based on scheme (realm1 is - present in basic.jsp as well as digest.jsp). - Also tests Digest authentication scheme. --> - <credentials username="digest_user" password="digest_pass"> - <authscope host="127.0.0.1" port="47500" realm="realm1" - scheme="DIGEST"/> - </credentials> - - <!-- Test case for Basic authentication scheme. --> - <credentials username="user1" password="pass1"> - <authscope host="127.0.0.1" port="47500" realm="realm1"/> - </credentials> - <credentials username="user2" password="pass2"> - <authscope host="127.0.0.1" port="47500" realm="realm2"/> - </credentials> - -</auth-configuration> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml deleted file mode 100644 index 856ea15..0000000 --- a/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml +++ /dev/null @@ -1,52 +0,0 @@ -<?xml version="1.0"?> -<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<configuration> - -<property> - <name>http.robots.agents</name> - <value>Nutch-Test,*</value> - <description></description> -</property> - -<property> - <name>http.agent.name</name> - <value>Nutch-Test</value> - <description></description> -</property> - -<property> - <name>http.agent.description</name> - <value>Nutch protocol-httpclient test</value> - <description></description> -</property> - -<property> - <name>http.auth.file</name> - <value>httpclient-auth-test.xml</value> - <description></description> -</property> - -<property> - <name>http.timeout</name> - <value>60000</value> - <description></description> -</property> - -</configuration> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java deleted file mode 100644 index 783e5af..0000000 --- a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.httpclient; - -import java.net.URL; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.ContextHandler; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.SessionHandler; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; - -/** - * Test cases for protocol-httpclient. - */ -public class TestProtocolHttpClient { - - private Server server; - private Configuration conf; - private static final String RES_DIR = System.getProperty("test.data", "."); - private int port; - private Http http = new Http(); - - @Before - public void setUp() throws Exception { - - ContextHandler context = new ContextHandler(); - context.setContextPath("/"); - context.setResourceBase(RES_DIR); - ServletHandler sh = new ServletHandler(); - sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp"); - context.addHandler(sh); - context.addHandler(new SessionHandler()); - - server = new Server(); - server.addHandler(context); - - conf = new Configuration(); - conf.addResource("nutch-default.xml"); - conf.addResource("nutch-site-test.xml"); - - http = new Http(); - http.setConf(conf); - } - - @After - public void tearDown() throws Exception { - server.stop(); - for (int i = 0; i < 5; i++) { - if (!server.isStopped()) { - Thread.sleep(1000); - } - } - } - - /** - * Tests whether the client can remember cookies. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testCookies() throws Exception { - startServer(47500); - fetchPage("/cookies.jsp", 200); - fetchPage("/cookies.jsp?cookie=yes", 200); - } - - /** - * Tests that no pre-emptive authorization headers are sent by the client. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNoPreemptiveAuth() throws Exception { - startServer(47500); - fetchPage("/noauth.jsp", 200); - } - - /** - * Tests default credentials. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDefaultCredentials() throws Exception { - startServer(47502); - fetchPage("/basic.jsp", 200); - } - - /** - * Tests basic authentication scheme for various realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testBasicAuth() throws Exception { - startServer(47500); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 200); - fetchPage("/basic.jsp?case=2", 200); - server.start(); - } - - /** - * Tests that authentication happens for a defined realm and not for other - * realms for a host:port when an extra <code>authscope</code> tag is not - * defined to match all other realms. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testOtherRealmsNoAuth() throws Exception { - startServer(47501); - fetchPage("/basic.jsp", 200); - fetchPage("/basic.jsp?case=1", 401); - fetchPage("/basic.jsp?case=2", 401); - } - - /** - * Tests Digest authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testDigestAuth() throws Exception { - startServer(47500); - fetchPage("/digest.jsp", 200); - } - - /** - * Tests NTLM authentication scheme. - * - * @throws Exception - * If an error occurs or the test case fails. - */ - @Test - public void testNtlmAuth() throws Exception { - startServer(47501); - fetchPage("/ntlm.jsp", 200); - } - - /** - * Starts the Jetty server at a specified port. - * - * Will try up to 10 ports to find an available port to use. - * - * @param portno - * Port number. - * @throws Exception - * When an error occurs. - */ - private void startServer(int portno) throws Exception { - SocketConnector listener = new SocketConnector(); - listener.setHost("127.0.0.1"); - server.addConnector(listener); - for (int p = portno; p < portno + 10; p++) { - port = portno; - listener.setPort(port); - try { - server.start(); - break; - } catch (Exception e) { - if (p == portno + 9) { - throw e; - } - } - } - } - - /** - * Fetches the specified <code>page</code> from the local Jetty server and - * checks whether the HTTP response status code matches with the expected - * code. - * - * @param page - * Page to be fetched. - * @param expectedCode - * HTTP response status code expected while fetching the page. - * @throws Exception - * When an error occurs or test case fails. - */ - private void fetchPage(String page, int expectedCode) throws Exception { - URL url = new URL("http", "127.0.0.1", port, page); - Response response = null; - response = http.getResponse(url, new CrawlDatum(), true); - - int code = response.getCode(); - Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/README.md ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/README.md b/src/plugin/protocol-interactiveselenium/README.md deleted file mode 100644 index dd43ee7..0000000 --- a/src/plugin/protocol-interactiveselenium/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Nutch Interactive Selenium -========================== - -This protocol plugin allows you to fetch and interact with pages using [Selenium](http://www.seleniumhq.org/). - -# Dependencies and Configuration - -You will need to have [Selenium](http://www.seleniumhq.org/) and a compatible version of Firefox installed to use this plugin. - -Set the protocol to be used in your Nutch configuration files. -``` -<!-- NUTCH_HOME/conf/nutch-site.xml --> - -<configuration> - ... - <property> - <name>plugin.includes</name> - <value>protocol-interactiveselenium|urlfilter-regex| ... </value> - <description></description> - </property> -``` - -# Custom Handlers - -Only basic functionality is included in the DefaultHandler that comes with the plugin. If you want additional functionality you can implement custom handlers by implementing the InteractiveSeleniumHandler interface in the plugin package. Be sure to also update the plugin config to include your new handler. - -``` -<!-- NUTCH_HOME/conf/nutch-site.xml --> -<property> - <name>interactiveselenium.handlers</name> - <value>NewCustomHandler,DefaultHandler</value> - <description></description> -</property> -``` - -# Handler Info - -Handlers are called in the order that they're specified in the configuration. A "clean" driver is used for each handler so multiple handlers won't interfere with each other. Page content is appended together from each handler and returned for the request. http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/build-ivy.xml b/src/plugin/protocol-interactiveselenium/build-ivy.xml deleted file mode 100644 index 9f96619..0000000 --- a/src/plugin/protocol-interactiveselenium/build-ivy.xml +++ /dev/null @@ -1,54 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="protocol-interactiveselenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - - <property name="ivy.install.version" value="2.1.0" /> - <condition property="ivy.home" value="${env.IVY_HOME}"> - <isset property="env.IVY_HOME" /> - </condition> - <property name="ivy.home" value="${user.home}/.ant" /> - <property name="ivy.checksums" value="" /> - <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> - - <target name="download-ivy" unless="offline"> - - <mkdir dir="${ivy.jar.dir}"/> - <!-- download Ivy from web site so that it can be used even without any special installation --> - <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" - dest="${ivy.jar.file}" usetimestamp="true"/> - </target> - - <target name="init-ivy" depends="download-ivy"> - <!-- try to load ivy here from ivy home, in case the user has not already dropped - it into ant's lib dir (note that the latter copy will always take precedence). - We will not fail as long as local lib dir exists (it may be empty) and - ivy is in at least one of ant's lib dir or the local lib dir. --> - <path id="ivy.lib.path"> - <fileset dir="${ivy.jar.dir}" includes="*.jar"/> - - </path> - <taskdef resource="org/apache/ivy/ant/antlib.xml" - uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> - </target> - - <target name="deps-jar" depends="init-ivy"> - <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/build.xml b/src/plugin/protocol-interactiveselenium/build.xml deleted file mode 100644 index 69dab90..0000000 --- a/src/plugin/protocol-interactiveselenium/build.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="protocol-interactiveselenium" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-http"/> - <ant target="jar" inheritall="false" dir="../lib-selenium"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-http/*.jar" /> - <include name="**/lib-selenium/*.jar" /> - <include name="**/protocol-selenium/*.jar" /> - </fileset> - </path> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml b/src/plugin/protocol-interactiveselenium/ivy.xml deleted file mode 100644 index ff07f8c..0000000 --- a/src/plugin/protocol-interactiveselenium/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="default"/> - </publications> - - <dependencies> - <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! --> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/plugin.xml b/src/plugin/protocol-interactiveselenium/plugin.xml deleted file mode 100644 index a69a1e5..0000000 --- a/src/plugin/protocol-interactiveselenium/plugin.xml +++ /dev/null @@ -1,47 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="protocol-interactiveselenium" - name="Http Protocol Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="protocol-interactiveselenium.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-http"/> - <import plugin="lib-selenium"/> - </requires> - - <extension id="org.apache.nutch.protocol.interactiveselenium" - name="HttpProtocol" - point="org.apache.nutch.protocol.Protocol"> - - <implementation id="org.apache.nutch.protocol.interactiveselenium.Http" - class="org.apache.nutch.protocol.interactiveselenium.Http"> - <parameter name="protocolName" value="http"/> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java deleted file mode 100644 index 9449fa1..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.interactiveselenium; - -// JDK imports -import java.io.IOException; -import java.net.URL; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.util.NutchConfiguration; - -import org.apache.nutch.protocol.interactiveselenium.HttpResponse; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class Http extends HttpBase { - - public static final Logger LOG = LoggerFactory.getLogger(Http.class); - - public Http() { - super(LOG); - } - - @Override - public void setConf(Configuration conf) { - super.setConf(conf); - } - - public static void main(String[] args) throws Exception { - Http http = new Http(); - http.setConf(NutchConfiguration.create()); - main(http, args); - } - - @Override - protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { - return new HttpResponse(this, url, datum); - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java deleted file mode 100644 index a1ccf29..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ /dev/null @@ -1,399 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.interactiveselenium; - -// JDK imports -import java.io.BufferedInputStream; -import java.io.EOFException; -import java.io.IOException; -import java.io.OutputStream; -import java.io.ByteArrayOutputStream; -import java.io.PushbackInputStream; -import java.net.InetSocketAddress; -import java.net.Socket; -import java.net.URL; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.SpellCheckedMetadata; -import org.apache.nutch.net.protocols.HttpDateFormat; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpException; -import org.apache.nutch.protocol.http.api.HttpBase; -import org.openqa.selenium.WebDriver; - -import org.apache.nutch.protocol.selenium.HttpWebClient; - -/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */ - -public class HttpResponse implements Response { - - private Http http; - private URL url; - private String orig; - private String base; - private byte[] content; - private int code; - private Metadata headers = new SpellCheckedMetadata(); - private static InteractiveSeleniumHandler[] handlers; - - /** The nutch configuration */ - private Configuration conf = null; - - public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { - - this.conf = http.getConf(); - this.http = http; - this.url = url; - this.orig = url.toString(); - this.base = url.toString(); - - if (!"http".equals(url.getProtocol())) - throw new HttpException("Not an HTTP url:" + url); - - if (Http.LOG.isTraceEnabled()) { - Http.LOG.trace("fetching " + url); - } - - String path = "".equals(url.getFile()) ? "/" : url.getFile(); - - // some servers will redirect a request with a host line like - // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they - // don't want the :80... - - String host = url.getHost(); - int port; - String portString; - if (url.getPort() == -1) { - port = 80; - portString = ""; - } else { - port = url.getPort(); - portString = ":" + port; - } - Socket socket = null; - - try { - socket = new Socket(); // create the socket - socket.setSoTimeout(http.getTimeout()); - - // connect - String sockHost = http.useProxy(url) ? http.getProxyHost() : host; - int sockPort = http.useProxy(url) ? http.getProxyPort() : port; - InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); - socket.connect(sockAddr, http.getTimeout()); - - // make request - OutputStream req = socket.getOutputStream(); - - StringBuffer reqStr = new StringBuffer("GET "); - if (http.useProxy(url)) { - reqStr.append(url.getProtocol() + "://" + host + portString + path); - } else { - reqStr.append(path); - } - - reqStr.append(" HTTP/1.0\r\n"); - - reqStr.append("Host: "); - reqStr.append(host); - reqStr.append(portString); - reqStr.append("\r\n"); - - reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); - - String userAgent = http.getUserAgent(); - if ((userAgent == null) || (userAgent.length() == 0)) { - if (Http.LOG.isErrorEnabled()) { - Http.LOG.error("User-agent is not set!"); - } - } else { - reqStr.append("User-Agent: "); - reqStr.append(userAgent); - reqStr.append("\r\n"); - } - - reqStr.append("Accept-Language: "); - reqStr.append(this.http.getAcceptLanguage()); - reqStr.append("\r\n"); - - reqStr.append("Accept: "); - reqStr.append(this.http.getAccept()); - reqStr.append("\r\n"); - - if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); - reqStr.append("\r\n"); - } - reqStr.append("\r\n"); - - byte[] reqBytes = reqStr.toString().getBytes(); - - req.write(reqBytes); - req.flush(); - - PushbackInputStream in = // process response - new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), - Http.BUFFER_SIZE); - - StringBuffer line = new StringBuffer(); - - boolean haveSeenNonContinueStatus = false; - while (!haveSeenNonContinueStatus) { - // parse status code line - this.code = parseStatusLine(in, line); - // parse headers - parseHeaders(in, line); - haveSeenNonContinueStatus = code != 100; // 100 is "Continue" - } - - // Get Content type header - String contentType = getHeader(Response.CONTENT_TYPE); - - // handle with Selenium only if content type in HTML or XHTML - if (contentType != null) { - if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { - readPlainContent(url); - } else { - try { - int contentLength = Integer.MAX_VALUE; - String contentLengthString = headers.get(Response.CONTENT_LENGTH); - if (contentLengthString != null) { - try { - contentLength = Integer.parseInt(contentLengthString.trim()); - } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + contentLengthString); - } - } - - if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { - contentLength = http.getMaxContent(); - } - - byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; - int bufferFilled = 0; - int totalRead = 0; - ByteArrayOutputStream out = new ByteArrayOutputStream(); - while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 - && totalRead + bufferFilled <= contentLength) { - totalRead += bufferFilled; - out.write(buffer, 0, bufferFilled); - } - - content = out.toByteArray(); - - } catch (Exception e) { - if (code == 200) - throw new IOException(e.toString()); - // for codes other than 200 OK, we are fine with empty content - } finally { - if (in != null) { - in.close(); - } - } - } - } - - } finally { - if (socket != null) - socket.close(); - } - } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - - public URL getUrl() { - return url; - } - - public int getCode() { - return code; - } - - public String getHeader(String name) { - return headers.get(name); - } - - public Metadata getHeaders() { - return headers; - } - - public byte[] getContent() { - return content; - } - - /* ------------------------- * - * <implementation:Response> * - * ------------------------- */ - private void loadSeleniumHandlers() { - if (handlers != null) return; - - String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler"); - String[] handlerNames = handlerConfig.split(","); - handlers = new InteractiveSeleniumHandler[handlerNames.length]; - for (int i = 0; i < handlerNames.length; i++) { - try { - String classToLoad = this.getClass().getPackage().getName() + "." + handlerNames[i]; - handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance()); - Http.LOG.info("Successfully loaded " + classToLoad); - } catch (ClassNotFoundException e) { - Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]); - } catch (InstantiationException e) { - Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]); - } catch (IllegalAccessException e) { - Http.LOG.info("Illegal access with Handler: " + handlerNames[i]); - } - } - } - - private void readPlainContent(URL url) throws IOException { - if (handlers == null) - loadSeleniumHandlers(); - - String processedPage = ""; - - for (InteractiveSeleniumHandler handler : this.handlers) { - if (! handler.shouldProcessURL(url.toString())) { - continue; - } - - WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf); - - processedPage += handler.processDriver(driver); - - HttpWebClient.cleanUpDriver(driver); - } - - content = processedPage.getBytes("UTF-8"); - } - - private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { - readLine(in, line, false); - - int codeStart = line.indexOf(" "); - int codeEnd = line.indexOf(" ", codeStart + 1); - - // handle lines with no plaintext result code, ie: - // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" - if (codeEnd == -1) - codeEnd = line.length(); - - int code; - try { - code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); - } catch (NumberFormatException e) { - throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e); - } - - return code; - } - - private void processHeaderLine(StringBuffer line) throws IOException, HttpException { - - int colonIndex = line.indexOf(":"); // key is up to colon - if (colonIndex == -1) { - int i; - for (i = 0; i < line.length(); i++) - if (!Character.isWhitespace(line.charAt(i))) - break; - if (i == line.length()) - return; - throw new HttpException("No colon in header:" + line); - } - String key = line.substring(0, colonIndex); - - int valueStart = colonIndex + 1; // skip whitespace - while (valueStart < line.length()) { - int c = line.charAt(valueStart); - if (c != ' ' && c != '\t') - break; - valueStart++; - } - String value = line.substring(valueStart); - headers.set(key, value); - } - - // Adds headers to our headers Metadata - private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { - - while (readLine(in, line, true) != 0) { - - // handle HTTP responses with missing blank line after headers - int pos; - if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1) - || ((pos = line.indexOf("<html")) != -1)) { - - in.unread(line.substring(pos).getBytes("UTF-8")); - line.setLength(pos); - - try { - //TODO: (CM) We don't know the header names here - //since we're just handling them generically. It would - //be nice to provide some sort of mapping function here - //for the returned header names to the standard metadata - //names in the ParseData class - processHeaderLine(line); - } catch (Exception e) { - // fixme: - Http.LOG.warn("Error: ", e); - } - return; - } - - processHeaderLine(line); - } - } - - private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine) - throws IOException { - line.setLength(0); - for (int c = in.read(); c != -1; c = in.read()) { - switch (c) { - case '\r': - if (peek(in) == '\n') { - in.read(); - } - case '\n': - if (line.length() > 0) { - // at EOL -- check for continued line if the current - // (possibly continued) line wasn't blank - if (allowContinuedLine) - switch (peek(in)) { - case ' ': - case '\t': // line is continued - in.read(); - continue; - } - } - return line.length(); // else complete - default: - line.append((char) c); - } - } - throw new EOFException(); - } - - private static int peek(PushbackInputStream in) throws IOException { - int value = in.read(); - in.unread(value); - return value; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java deleted file mode 100644 index f3c0f6f..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.interactiveselenium; - -import org.apache.hadoop.util.StringUtils; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.WebDriver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This is a placeholder/example of a technique or use case where we do multiple - * interaction with the web driver and need data from each such interaction in the end. This code shows that after you have - * done multiple interactions and accumulated data you can in the end append that to the driver. - */ -public class DefalultMultiInteractionHandler implements - InteractiveSeleniumHandler { - private static final Logger LOG = LoggerFactory - .getLogger(DefalultMultiInteractionHandler.class); - - public String processDriver(WebDriver driver) { - // loop and get multiple pages in this string - String accumulatedData = ""; - try { - - // append the string to the last page's driver - JavascriptExecutor jsx = (JavascriptExecutor) driver; - jsx.executeScript("document.body.innerHTML=document.body.innerHTML " - + accumulatedData + ";"); - } catch (Exception e) { - LOG.info(StringUtils.stringifyException(e)); - } - return accumulatedData; - } - - public boolean shouldProcessURL(String URL) { - return true; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java deleted file mode 100644 index e3423d5..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.interactiveselenium; - -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.StringUtils; -import org.apache.nutch.util.NutchConfiguration; -import org.openqa.selenium.By; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.support.ui.WebDriverWait; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This handler clicks all the <a hfer="javascript:void(null);"> tags - * because it considers them as not usual links but ajax links/interactions. This uses the same logic of - * DefalultMultiInteractionHandler. - */ -public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandler { - private static final Logger LOG = LoggerFactory - .getLogger(DefaultClickAllAjaxLinksHandler.class); - - public String processDriver(WebDriver driver) { - - String accumulatedData = ""; - try { - - - driver.findElement(By.tagName("body")).getAttribute("innerHTML"); - Configuration conf = NutchConfiguration.create(); - new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3)); - - List<WebElement> atags = driver.findElements(By.tagName("a")); - int numberofajaxlinks = atags.size(); - for (int i = 0; i < numberofajaxlinks; i++) { - - if (atags.get(i).getAttribute("href") != null - && atags.get(i).getAttribute("href") - .equals("javascript:void(null);")) { - - atags.get(i).click(); - - if (i == numberofajaxlinks - 1) { - // append everything to the driver in the last round - JavascriptExecutor jsx = (JavascriptExecutor) driver; - jsx.executeScript("document.body.innerHTML=document.body.innerHTML " - + accumulatedData + ";"); - continue; - } - - accumulatedData += driver.findElement(By.tagName("body")) - .getAttribute("innerHTML"); - - // refreshing the handlers as the page was interacted with - driver.navigate().refresh(); - new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", - 3)); - atags = driver.findElements(By.tagName("a")); - } - } - } catch (Exception e) { - LOG.info(StringUtils.stringifyException(e)); - } - return accumulatedData; - } - - public boolean shouldProcessURL(String URL) { - return true; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java deleted file mode 100644 index ae7b97e..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.interactiveselenium; - -import org.openqa.selenium.WebDriver; - -public class DefaultHandler implements InteractiveSeleniumHandler { - public String processDriver(WebDriver driver) { - return null; - } - - public boolean shouldProcessURL(String URL) { - return true; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java deleted file mode 100644 index 9ce1e26..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.protocol.interactiveselenium; - -import org.openqa.selenium.WebDriver; - -public interface InteractiveSeleniumHandler { - public String processDriver(WebDriver driver); - public boolean shouldProcessURL(String URL); -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html deleted file mode 100644 index 75cd5b5..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-selenium/README.md ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-selenium/README.md b/src/plugin/protocol-selenium/README.md deleted file mode 100644 index 1462b47..0000000 --- a/src/plugin/protocol-selenium/README.md +++ /dev/null @@ -1,208 +0,0 @@ -Nutch Selenium -============== - -# Introduction - -This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack! - -The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient. - -There are essentially two ways in which Nutch can be used with Selenium. - - * Locally (on each node) as a self contained process, or - * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes. - -# Installation - -## Part 1: - -### A) Setting up Selenium (local mode) - - * Ensure that you have your prefered browser installed. Currently Chrome, Safari, Opera, PhantomJS and Firefox are supported. Here there example of installing Firefox is provided. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox) -``` -sudo apt-get install firefox -``` - - * Install Xvfb and its associates - -This step is not necessary for the PhantomJs broswer and may not be needed for all browsers. - -``` -sudo apt-get install xorg synaptic xvfb gtk2-engines-pixbuf xfonts-cyrillic xfonts-100dpi \ - xfonts-75dpi xfonts-base xfonts-scalable freeglut3-dev dbus-x11 openbox x11-xserver-utils \ - libxrender1 cabextract -``` - - * Set a display for Xvfb, so that firefox believes a display is connected - -``` -sudo /usr/bin/Xvfb :11 -screen 0 1024x768x24 & -sudo export DISPLAY=:11 -``` -### B) Setting up a Selenium Grid - -Using the Selenium Grid will allow you to parallelize the job by facilitating access of several instances of browsers whether on one machine or on several machines. Note that grid facilitates heterogeneity with regards to browser types used. However, these steps have been tested using a homogenous Selenium Grid with Firefox and PhantomJS browsers. - - * Download the [Selenium Standalone Server](http://www.seleniumhq.org/download/) and follow the installation instructions. - - * Some important configurations to note while setting up the selenium-hub and the selenium-nodes are: - * For the hub: - - maxSession (how many browser sessions to allow on the grid at a time) - - browserTimeout (how long to wait before timing out a browser session. This is dependent on the interactivity to be completed on the page) - - * For the nodes: - - browserName=<browser>, maxInstances (the max number of instances of the same version browser to allow per a system) - - browserName=<browser>, maxSession (the max number of sessions of any type of browser/version to allow per a system) - - * Go headless with your selenium Grid installation. There are different ways to this. See [this resource](http://elementalselenium.com/tips/38-headless) for further details. - - * For Nutch efficiency, and optimization of the grid, consider editing the following configs in **nutch-site.xml** - - fetcher.threads.per.queue (change value to the value of the maxSession config on the hub) - - fetcher.threads.fetch (change value to the value of the maxSession config on the hub) - - fetcher.server.delay (As multiple threads may be accessing a single server at a time, consider changing this value to 4-5 seconds for politeness) - - fetcher.server.min.delay (As multiple threads may be accessing a single server at a time, consider changing this values to 4-5 seconds for politeness) - - Ensure all configs for the hub mentioned in Part 2 are appropriately set. - - * To activate the full selenium grid, edit **$NUTCH_HOME/runtime/local/bin/crawl** script: - - numThreads = maxSession on nodes * num of nodes - - -## Part 2: Installing plugin for Nutch (where NUTCH_HOME is the root of your nutch install) - - * Ensure that the plugin will be used as the protocol parser in your config - -``` -<!-- NUTCH_HOME/conf/nutch-site.xml --> - -<configuration> - ... - <property> - <name>plugin.includes</name> - <value>protocol-selenium|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value> - <description>Regular expression naming plugin directory names to - include. Any plugin not matching this expression is excluded. - In any case you need at least include the nutch-extensionpoints plugin. By - default Nutch includes crawling just HTML and plain text via HTTP, - and basic indexing and search plugins. In order to use HTTPS please enable - protocol-httpclient, but be aware of possible intermittent problems with the - underlying commons-httpclient library. - </description> - </property> -``` - -* Then ensure that you have the correct configuration set within the following configuration options - -``` -<!-- protocol-selenium plugin properties --> - -<property> - <name>selenium.driver</name> - <value>firefox</value> - <description> - A String value representing the flavour of Selenium - WebDriver() to use. Currently the following options - exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomJS', and 'remote'. - If 'remote' is used it is essential to also set correct properties for - 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and - 'selenium.hub.protocol'. - </description> -</property> - -<property> - <name>selenium.take.screenshot</name> - <value>false</value> - <description> - Boolean property determining whether the protocol-selenium - WebDriver should capture a screenshot of the URL. If set to - true remember to define the 'selenium.screenshot.location' - property as this determines the location screenshots should be - persisted to on HDFS. If that property is not set, screenshots - are simply discarded. - </description> -</property> - -<property> - <name>selenium.screenshot.location</name> - <value></value> - <description> - The location on disk where a URL screenshot should be saved - to if the 'selenium.take.screenshot' proerty is set to true. - By default this is null, in this case screenshots held in memory - are simply discarded. - </description> -</property> - -<property> - <name>selenium.hub.port</name> - <value>4444</value> - <description>Selenium Hub Location connection port</description> -</property> - -<property> - <name>selenium.hub.path</name> - <value>/wd/hub</value> - <description>Selenium Hub Location connection path</description> -</property> - -<property> - <name>selenium.hub.host</name> - <value>localhost</value> - <description>Selenium Hub Location connection host</description> -</property> - -<property> - <name>selenium.hub.protocol</name> - <value>http</value> - <description>Selenium Hub Location connection protocol</description> -</property> - -<property> - <name>selenium.grid.driver</name> - <value>firefox</value> - <description>A String value representing the flavour of Selenium - WebDriver() used on the selenium grid. Currently the following options - exist - 'firefox' or 'phantomJS' </description> -</property> - -<property> - <name>selenium.grid.binary</name> - <value></value> - <description>A String value representing the path to the browser binary - location for each node - </description> -</property> - -<!-- lib-selenium configuration --> -<property> - <name>libselenium.page.load.delay</name> - <value>3</value> - <description> - The delay in seconds to use when loading a page with lib-selenium. This - setting is used by protocol-selenium and protocol-interactiveselenium - since they depending on lib-selenium for fetching. - </description> -</property> -``` - * If you've selected 'remote' value for the 'selenium.driver' property, ensure that you've configured - the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation). - - * Compile nutch -``` -ant runtime -``` - - * Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above) - -## Part 3: Common Pitfalls - -* Be sure your browser version and selenium version are compatible (See list in 'Tested configurations' section below) -* Be sure to start the Xvfb window then start selenium (not a necessary step for PhantomJS) -* Disconnecting and reconnect nodes after a hub config change has proven useful in our tests. -* Be sure that each browser session deallocates its webdriver resource independently of any other tests running on other broswers (check out driver.quit() and driver.close()). - -### Tested configurations - -* Firefox 31.4.0 and Selenium 2.48.2 -* PhantomJS 2.1.1 and Selenium 2.48.2 -* PhantomJS 2.1.1 and Selenium 2.53.0 -
