fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3cda2229 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3cda2229 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3cda2229 Branch: refs/heads/master Commit: 3cda222971c970270dcc7525b97dfffe4b818ced Parents: 366104d Author: Karanjeet Singh <[email protected]> Authored: Mon Mar 28 22:58:40 2016 -0700 Committer: Karanjeet Singh <[email protected]> Committed: Mon Mar 28 22:58:40 2016 -0700 ---------------------------------------------------------------------- default.properties | 1 + .../protocol/htmlunit/HtmlUnitWebDriver.java | 125 +++++++++---------- .../htmlunit/HtmlUnitWebWindowListener.java | 53 ++++---- .../nutch/protocol/htmlunit/HttpResponse.java | 5 +- 4 files changed, 93 insertions(+), 91 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/default.properties ---------------------------------------------------------------------- diff --git a/default.properties b/default.properties index d34f778..aec5d51 100644 --- a/default.properties +++ b/default.properties @@ -90,6 +90,7 @@ plugins.protocol=\ org.apache.nutch.protocol.http*:\ org.apache.nutch.protocol.httpclient*:\ org.apache.nutch.protocol.selenium* + org.apache.nutch.protocol.htmlunit* # # URL Filter Plugins http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java index fc231c3..5e2c0ac 100644 --- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java +++ 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java @@ -51,79 +51,79 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { private static int maxRedirects; public HtmlUnitWebDriver() { - super(enableJavascript); + super(enableJavascript); } @Override protected WebClient modifyWebClient(WebClient client) { - client.getOptions().setJavaScriptEnabled(enableJavascript); - client.getOptions().setCssEnabled(enableCss); - client.getOptions().setRedirectEnabled(enableRedirect); - if(enableJavascript) - client.setJavaScriptTimeout(javascriptTimeout); - client.getOptions().setThrowExceptionOnScriptError(false); - if(enableRedirect) - client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects)); - return client; + client.getOptions().setJavaScriptEnabled(enableJavascript); + client.getOptions().setCssEnabled(enableCss); + client.getOptions().setRedirectEnabled(enableRedirect); + if(enableJavascript) + client.setJavaScriptTimeout(javascriptTimeout); + client.getOptions().setThrowExceptionOnScriptError(false); + if(enableRedirect) + client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects)); + return client; } public static WebDriver getDriverForPage(String url, Configuration conf) { - long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3); - enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true); - enableCss = conf.getBoolean("htmlunit.enable.css", false); - javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); - int redirects = Integer.parseInt(conf.get("http.redirect.max", "0")); - enableRedirect = redirects <= 0 ? 
false : true; - maxRedirects = redirects; + long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3); + enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true); + enableCss = conf.getBoolean("htmlunit.enable.css", false); + javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); + int redirects = Integer.parseInt(conf.get("http.redirect.max", "0")); + enableRedirect = redirects <= 0 ? false : true; + maxRedirects = redirects; - WebDriver driver = null; + WebDriver driver = null; - try { - driver = new HtmlUnitWebDriver(); - driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS); - driver.get(url); - } catch(Exception e) { - if(e instanceof TimeoutException) { - LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); - return driver; - } - cleanUpDriver(driver); - throw new RuntimeException(e); - } + try { + driver = new HtmlUnitWebDriver(); + driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS); + driver.get(url); + } catch(Exception e) { + if(e instanceof TimeoutException) { + LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); + return driver; + } + cleanUpDriver(driver); + throw new RuntimeException(e); + } - return driver; + return driver; } public static String getHTMLContent(WebDriver driver, Configuration conf) { - try { - if (conf.getBoolean("htmlunit.take.screenshot", false)) - takeScreenshot(driver, conf); + try { + if (conf.getBoolean("htmlunit.take.screenshot", false)) + takeScreenshot(driver, conf); - String innerHtml = ""; - if(enableJavascript) { - WebElement body = driver.findElement(By.tagName("body")); - innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); - } - else - innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); - return innerHtml; - } catch(Exception e) { - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - 
cleanUpDriver(driver); - throw new RuntimeException(e); - } + String innerHtml = ""; + if(enableJavascript) { + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + } + else + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + return innerHtml; + } catch(Exception e) { + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + cleanUpDriver(driver); + throw new RuntimeException(e); + } } public static void cleanUpDriver(WebDriver driver) { - if (driver != null) { - try { - driver.close(); - driver.quit(); - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - } catch (Exception e) { - throw new RuntimeException(e); - } + if (driver != null) { + try { + driver.close(); + driver.quit(); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + } catch (Exception e) { + throw new RuntimeException(e); } + } } /** @@ -142,23 +142,22 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { try { if (conf.getBoolean("htmlunit.take.screenshot", false)) - takeScreenshot(driver, conf); + takeScreenshot(driver, conf); - String innerHtml = ""; if(enableJavascript) { - WebElement body = driver.findElement(By.tagName("body")); - innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); } else - innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); return innerHtml; } catch (Exception e) { - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - throw new RuntimeException(e); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + throw new RuntimeException(e); } finally { - cleanUpDriver(driver); + cleanUpDriver(driver); } } 
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java index 760f4aa..baa8774 100644 --- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java @@ -5,32 +5,33 @@ import com.gargoylesoftware.htmlunit.WebWindowListener; public class HtmlUnitWebWindowListener implements WebWindowListener { - private Integer redirectCount = 0; - private Integer maxRedirects = 0; - - public HtmlUnitWebWindowListener() { - - } - - public HtmlUnitWebWindowListener(int maxRedirects) { - this.maxRedirects = maxRedirects; - } - - @Override - public void webWindowOpened(WebWindowEvent event) { - - } + private Integer redirectCount = 0; + private Integer maxRedirects = 0; + + public HtmlUnitWebWindowListener() { + + } + + public HtmlUnitWebWindowListener(int maxRedirects) { + this.maxRedirects = maxRedirects; + } + + @Override + public void webWindowOpened(WebWindowEvent event) { + + } - @Override - public void webWindowContentChanged(WebWindowEvent event) { - redirectCount++; - if(redirectCount > maxRedirects) - throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects); - } + @Override + public void webWindowContentChanged(WebWindowEvent event) { + redirectCount++; + if(redirectCount > maxRedirects) + throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects); + } - @Override - public void webWindowClosed(WebWindowEvent event) { - - } - 
+ @Override + public void webWindowClosed(WebWindowEvent event) { + + } + } + http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 72b1fa1..a2f3b1e 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -344,7 +344,8 @@ public class HttpResponse implements Response { @Override public int getCode() { - // TODO Auto-generated method stub - return code; + // TODO Auto-generated method stub + return code; } } +
