Repository: nutch Updated Branches: refs/heads/master fda3e148b -> 9f32fe84a
fix the cookie policy issue when the form authentication receives session cookie in a non-standard format - NUTCH-2280 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/993e997e Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/993e997e Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/993e997e Branch: refs/heads/master Commit: 993e997e2d5795c0623cdf5614d02c7a8ce405d5 Parents: 5943d11 Author: Steve Yao <[email protected]> Authored: Tue Jul 12 19:41:10 2016 +0800 Committer: Steve Yao <[email protected]> Committed: Tue Jul 12 19:41:10 2016 +0800 ---------------------------------------------------------------------- .../apache/nutch/protocol/httpclient/Http.java | 79 ++++++++++++-------- .../httpclient/HttpFormAuthConfigurer.java | 21 +++++- .../httpclient/HttpFormAuthentication.java | 28 +++++++ 3 files changed, 95 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java index 75506ce..9b91180 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java @@ -69,16 +69,16 @@ import org.apache.nutch.util.NutchConfiguration; * session. * </p> * <p> - * Documentation can be found on the Nutch <a - * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes" - * >HttpAuthenticationSchemes</a> wiki page. + * Documentation can be found on the Nutch + * <a href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes" > + * HttpAuthenticationSchemes</a> wiki page. * </p> * <p> - * The original description of the motivation to support <a - * href="https://wiki.apache.org/nutch/HttpPostAuthentication" - * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally - * HttpPostAuthentication development is documented at the <a - * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira + * The original description of the motivation to support + * <a href="https://wiki.apache.org/nutch/HttpPostAuthentication" > + * HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally + * HttpPostAuthentication development is documented at the + * <a href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira * issue. * * @author Susam Pal @@ -146,6 +146,7 @@ public class Http extends HttpBase { setCredentials(); } catch (Exception ex) { if (LOG.isErrorEnabled()) { + LOG.error("Http ", ex); LOG.error("Could not read " + authFile + " : " + ex.getMessage()); } } @@ -202,15 +203,15 @@ public class Http extends HttpBase { // NUTCH-1836: Modification to increase the number of available connections // for multi-threaded crawls. // -------------------------------------------------------------------------------- - params.setMaxTotalConnections(conf.getInt( - "mapred.tasktracker.map.tasks.maximum", 5) - * conf.getInt("fetcher.threads.fetch", maxThreadsTotal)); + params.setMaxTotalConnections( + conf.getInt("mapred.tasktracker.map.tasks.maximum", 5) + * conf.getInt("fetcher.threads.fetch", maxThreadsTotal)); // Also set max connections per host to maxThreadsTotal since all threads // might be used to fetch from the same host - otherwise timeout errors can // occur - params.setDefaultMaxConnectionsPerHost(conf.getInt( - "fetcher.threads.fetch", maxThreadsTotal)); + params.setDefaultMaxConnectionsPerHost( + conf.getInt("fetcher.threads.fetch", maxThreadsTotal)); // executeMethod(HttpMethod) seems to ignore the connection timeout on the // connection manager. @@ -226,10 +227,8 @@ public class Http extends HttpBase { // prefer UTF-8 headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7")); // prefer understandable formats - headers - .add(new Header( - "Accept", - "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); + headers.add(new Header("Accept", + "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")); // accept gzipped content headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate")); hostConf.getParams().setParameter("http.default-headers", headers); @@ -266,7 +265,6 @@ public class Http extends HttpBase { */ private static synchronized void setCredentials() throws ParserConfigurationException, SAXException, IOException { - if (authRulesRead) return; @@ -333,9 +331,9 @@ public class Http extends HttpBase { defaultScheme = scheme; if (LOG.isTraceEnabled()) { - LOG.trace("Credentials - username: " + username - + "; set as default" + " for realm: " + realm + "; scheme: " - + scheme); + LOG.trace( + "Credentials - username: " + username + "; set as default" + + " for realm: " + realm + "; scheme: " + scheme); } } else if ("authscope".equals(scopeElement.getTagName())) { @@ -378,11 +376,15 @@ public class Http extends HttpBase { /** * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl" - * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field - * name="username" value="user1"/> </loginPostData> <additionalPostHeaders> - * <field name="header1" value="vaule1"/> </additionalPostHeaders> - * <removedFormFields> <field name="header1"/> </removedFormFields> - * </credentials> </auth-configuration> + * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field name + * ="username" value="user1"/> </loginPostData> + * <additionalPostHeaders> <field name="header1" value="vaule1"/> + * </additionalPostHeaders> + * <removedFormFields> <field name="header1"/> </removedFormFields> <!-- + * NUTCH-2280: Add <loginCookie> and it sub-node <policy> nodes into the + * <credentials> node. The <policy> will mark the POST login form cookie + * policy. The value could be CookiePolicy.<ConstantValues>. + * --> </credentials> </auth-configuration> */ private static HttpFormAuthConfigurer readFormAuthConfigurer( Element credElement, String authMethod) { @@ -407,6 +409,7 @@ public class Http extends HttpBase { } NodeList nodeList = credElement.getChildNodes(); + for (int j = 0; j < nodeList.getLength(); j++) { Node node = nodeList.item(j); if (!(node instanceof Element)) @@ -454,13 +457,28 @@ public class Http extends HttpBase { removedFormFields.add(name); } formConfigurer.setRemovedFormFields(removedFormFields); + } else if ("loginCookie".equals(element.getTagName())) { + // NUTCH-2280 + LOG.debug("start loginCookie"); + NodeList childNodes = element.getChildNodes(); + for (int k = 0; k < childNodes.getLength(); k++) { + Node fieldNode = childNodes.item(k); + if (!(fieldNode instanceof Element)) + continue; + Element fieldElement = (Element) fieldNode; + if ("policy".equals(fieldElement.getTagName())) { + String policy = fieldElement.getTextContent(); + formConfigurer.setCookiePolicy(policy); + LOG.debug("cookie policy is " + policy); + } + } } } return formConfigurer; } else { - throw new IllegalArgumentException("Unsupported authMethod: " - + authMethod); + throw new IllegalArgumentException( + "Unsupported authMethod: " + authMethod); } } @@ -510,8 +528,9 @@ public class Http extends HttpBase { } if (LOG.isTraceEnabled()) - LOG.trace("Pre-configured credentials with scope - host: " - + url.getHost() + "; port: " + port + "; not found for url: " + url); + LOG.trace( + "Pre-configured credentials with scope - host: " + url.getHost() + + "; port: " + port + "; not found for url: " + url); AuthScope serverAuthScope = getAuthScope(url.getHost(), port, defaultRealm, defaultScheme); http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java index b713ab6..f9cff36 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java @@ -33,8 +33,8 @@ public class HttpFormAuthConfigurer { */ private Map<String, String> additionalPostHeaders; /** - * If http post login returns redirect code: 301 or 302, - * Http Client will automatically follow the redirect. + * If http post login returns redirect code: 301 or 302, Http Client will + * automatically follow the redirect. */ private boolean loginRedirect; /** @@ -42,6 +42,12 @@ public class HttpFormAuthConfigurer { */ private Set<String> removedFormFields; + /** + * Use this cookie policy to set the HttpClient cookie policy. This value + * should be DEFAULT BROWSER_COMPATIBILITY NETSCAPE RFC_2109 + */ + private String cookiePolicy; + public HttpFormAuthConfigurer() { } @@ -102,5 +108,14 @@ public class HttpFormAuthConfigurer { public HttpFormAuthConfigurer setRemovedFormFields( Set<String> removedFormFields) { this.removedFormFields = removedFormFields; - return this; } + return this; + } + + public void setCookiePolicy(String policy) { + this.cookiePolicy = policy; + } + + public String getCookiePolicy() { + return this.cookiePolicy; + } } http://git-wip-us.apache.org/repos/asf/nutch/blob/993e997e/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java index 4c73f50..a6d4aa4 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java @@ -31,9 +31,12 @@ import java.util.Set; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; +import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.reflect.FieldUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -115,6 +118,11 @@ public class HttpFormAuthentication { // Entity enclosing requests cannot be redirected without user // intervention setLoginHeader(post); + + // NUTCH-2280 + LOGGER.debug("FormAuth: set cookie policy"); + this.setCookieParams(authConfigurer, post.getParams()); + post.addParameters(params.toArray(new NameValuePair[0])); int rspCode = client.executeMethod(post); if (LOGGER.isDebugEnabled()) { @@ -135,6 +143,26 @@ public class HttpFormAuthentication { } } } + + /** + * @throws NoSuchFieldException + * @throws SecurityException + * @throws IllegalArgumentException + * @throws IllegalAccessException + */ + private void setCookieParams(HttpFormAuthConfigurer formConfigurer, + HttpMethodParams params) + throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException { + // NUTCH-2280 - set the HttpClient cookie policy + if (formConfigurer.getCookiePolicy() != null) { + String policy = formConfigurer.getCookiePolicy(); + Object p = FieldUtils.readDeclaredStaticField(CookiePolicy.class, policy); + if(null != p) { + LOGGER.debug("reflection of cookie value: " + p.toString()); + params.setParameter(HttpMethodParams.COOKIE_POLICY, p); + } + } + } private void setLoginHeader(PostMethod post) { Map<String, String> headers = new HashMap<String, String>();
