fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5293599 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5293599 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5293599 Branch: refs/heads/master Commit: f52935994dec9468edf6087c5e11b3d9ed2517b1 Parents: 3cda222 Author: Karanjeet Singh <[email protected]> Authored: Tue Mar 29 01:47:10 2016 -0700 Committer: Karanjeet Singh <[email protected]> Committed: Tue Mar 29 01:47:10 2016 -0700 ---------------------------------------------------------------------- conf/nutch-default.xml | 96 ++++++-------------- default.properties | 4 +- .../protocol/htmlunit/HtmlUnitWebDriver.java | 42 ++++----- .../htmlunit/HtmlUnitWebWindowListener.java | 16 ++++ .../nutch/protocol/selenium/HttpWebClient.java | 37 ++++---- src/plugin/protocol-htmlunit/build.xml | 7 -- .../apache/nutch/protocol/htmlunit/Http.java | 4 - .../nutch/protocol/htmlunit/HttpResponse.java | 6 +- .../apache/nutch/protocol/htmlunit/package.html | 16 ++++ 9 files changed, 105 insertions(+), 123 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a5f17bf..1934991 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1874,20 +1874,44 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> - -<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit --> +<!-- plugin properties that applies to lib-selenium, protocol-selenium, + protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit --> <property> - <name>htmlunit.page.load.delay</name> + <name>page.load.delay</name> <value>3</value> <description> - The delay in seconds to use when loading a page with lib-htmlunit. 
This - setting is used by protocol-htmlunit since they depending on - lib-htmlunit for fetching. + The delay in seconds to use when loading a page with htmlunit or selenium. + </description> +</property> + +<property> + <name>take.screenshot</name> + <value>false</value> + <description> + Boolean property determining whether the protocol-htmlunit + WebDriver should capture a screenshot of the URL. If set to + true remember to define the 'screenshot.location' + property as this determines the location screenshots should be + persisted to on HDFS. If that property is not set, screenshots + are simply discarded. </description> </property> <property> + <name>screenshot.location</name> + <value></value> + <description> + The location on disk where a URL screenshot should be saved + to if the 'take.screenshot' property is set to true. + By default this is null, in this case screenshots held in memory + are simply discarded. + </description> +</property> + +<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit --> + +<property> <name>htmlunit.enable.javascript</name> <value>true</value> <description> @@ -1915,31 +1939,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> -<property> - <name>htmlunit.take.screenshot</name> - <value>false</value> - <description> - Boolean property determining whether the protocol-htmlunit - WebDriver should capture a screenshot of the URL. If set to - true remember to define the 'htmlunit.screenshot.location' - property as this determines the location screenshots should be - persisted to on HDFS. If that property is not set, screenshots - are simply discarded. - </description> -</property> - -<property> - <name>htmlunit.screenshot.location</name> - <value></value> - <description> - The location on disk where a URL screenshot should be saved - to if the 'htmlunit.take.screenshot' property is set to true. 
- By default this is null, in this case screenshots held in memory - are simply discarded. - </description> -</property> - - <!-- protocol-selenium plugin properties --> <property> @@ -1956,30 +1955,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </property> <property> - <name>selenium.take.screenshot</name> - <value>false</value> - <description> - Boolean property determining whether the protocol-selenium - WebDriver should capture a screenshot of the URL. If set to - true remember to define the 'selenium.screenshot.location' - property as this determines the location screenshots should be - persisted to on HDFS. If that property is not set, screenshots - are simply discarded. - </description> -</property> - -<property> - <name>selenium.screenshot.location</name> - <value></value> - <description> - The location on disk where a URL screenshot should be saved - to if the 'selenium.take.screenshot' property is set to true. - By default this is null, in this case screenshots held in memory - are simply discarded. - </description> -</property> - -<property> <name>selenium.hub.port</name> <value>4444</value> <description>Selenium Hub Location connection port</description> @@ -2069,17 +2044,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Currently this option exist for - 'firefox' </description> </property> -<!-- lib-selenium configuration --> -<property> - <name>libselenium.page.load.delay</name> - <value>3</value> - <description> - The delay in seconds to use when loading a page with lib-selenium. This - setting is used by protocol-selenium and protocol-interactiveselenium - since they depending on lib-selenium for fetching. 
- </description> -</property> - <!-- protocol-interactiveselenium configuration --> <property> <name>interactiveselenium.handlers</name> http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/default.properties ---------------------------------------------------------------------- diff --git a/default.properties b/default.properties index aec5d51..eb616c6 100644 --- a/default.properties +++ b/default.properties @@ -89,8 +89,8 @@ plugins.protocol=\ org.apache.nutch.protocol.ftp*:\ org.apache.nutch.protocol.http*:\ org.apache.nutch.protocol.httpclient*:\ - org.apache.nutch.protocol.selenium* - org.apache.nutch.protocol.htmlunit* + org.apache.nutch.protocol.selenium*:\ + org.apache.nutch.protocol.htmlunit*:\ # # URL Filter Plugins http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java index 5e2c0ac..064894e 100644 --- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java @@ -64,11 +64,11 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { client.getOptions().setThrowExceptionOnScriptError(false); if(enableRedirect) client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects)); - return client; + return client; } public static WebDriver getDriverForPage(String url, Configuration conf) { - long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3); + long pageLoadTimout = conf.getLong("page.load.delay", 3); enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true); enableCss = conf.getBoolean("htmlunit.enable.css", false); 
javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); @@ -84,8 +84,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { driver.get(url); } catch(Exception e) { if(e instanceof TimeoutException) { - LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); - return driver; + LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); + return driver; } cleanUpDriver(driver); throw new RuntimeException(e); @@ -96,19 +96,19 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { public static String getHTMLContent(WebDriver driver, Configuration conf) { try { - if (conf.getBoolean("htmlunit.take.screenshot", false)) - takeScreenshot(driver, conf); + if (conf.getBoolean("take.screenshot", false)) + takeScreenshot(driver, conf); String innerHtml = ""; if(enableJavascript) { - WebElement body = driver.findElement(By.tagName("body")); - innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); } else - innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); return innerHtml; } catch(Exception e) { - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); cleanUpDriver(driver); throw new RuntimeException(e); } @@ -141,23 +141,23 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { WebDriver driver = getDriverForPage(url, conf); try { - if (conf.getBoolean("htmlunit.take.screenshot", false)) - takeScreenshot(driver, conf); + if (conf.getBoolean("take.screenshot", false)) + takeScreenshot(driver, conf); String innerHtml = ""; if(enableJavascript) { - WebElement body = driver.findElement(By.tagName("body")); - innerHtml = 
(String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); } else - innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); return innerHtml; } catch (Exception e) { - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - throw new RuntimeException(e); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + throw new RuntimeException(e); } finally { - cleanUpDriver(driver); + cleanUpDriver(driver); } } @@ -167,8 +167,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); LOG.debug("In-memory screenshot taken of: {}", url); FileSystem fs = FileSystem.get(conf); - if (conf.get("htmlunit.screenshot.location") != null) { - Path screenshotPath = new Path(conf.get("htmlunit.screenshot.location") + "/" + srcFile.getName()); + if (conf.get("screenshot.location") != null) { + Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName()); OutputStream os = null; if (!fs.exists(screenshotPath)) { LOG.debug("No existing screenshot already exists... 
creating new file at {} {}.", screenshotPath, srcFile.getName()); @@ -179,7 +179,7 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver { LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); } else { LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " - + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", url); + + "'screenshot.location' is absent from nutch-site.xml.", url); } } catch (Exception e) { cleanUpDriver(driver); http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java index baa8774..c2b88a6 100644 --- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.protocol.htmlunit; import com.gargoylesoftware.htmlunit.WebWindowEvent; http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index 583b840..3a20cfe 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -68,7 +68,7 @@ public class HttpWebClient { public static WebDriver getDriverForPage(String url, Configuration conf) { WebDriver driver = null; DesiredCapabilities capabilities = null; - long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3); + long pageLoadWait = conf.getLong("page.load.delay", 3); try { String driverType = conf.get("selenium.driver", "firefox"); @@ -129,11 +129,11 @@ public class HttpWebClient { driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS); driver.get(url); } catch (Exception e) { - if(e instanceof TimeoutException) { - LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); - return driver; - } - cleanUpDriver(driver); + if(e instanceof TimeoutException) { + LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); + return driver; + } + cleanUpDriver(driver); throw new RuntimeException(e); } @@ -141,7 +141,7 @@ public class HttpWebClient { } public static String getHTMLContent(WebDriver driver, Configuration conf) { - if (conf.getBoolean("selenium.take.screenshot", false)) { + if (conf.getBoolean("take.screenshot", false)) 
{ takeScreenshot(driver, conf); } @@ -149,15 +149,15 @@ public class HttpWebClient { } public static void cleanUpDriver(WebDriver driver) { - if (driver != null) { - try { + if (driver != null) { + try { driver.close(); - driver.quit(); - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - } catch (Exception e) { - throw new RuntimeException(e); - } + driver.quit(); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + } catch (Exception e) { + throw new RuntimeException(e); } + } } /** @@ -175,7 +175,7 @@ public class HttpWebClient { WebDriver driver = getDriverForPage(url, conf); try { - if (conf.getBoolean("selenium.take.screenshot", false)) { + if (conf.getBoolean("take.screenshot", false)) { takeScreenshot(driver, conf); } @@ -201,8 +201,8 @@ public class HttpWebClient { File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); LOG.debug("In-memory screenshot taken of: {}", url); FileSystem fs = FileSystem.get(conf); - Path screenshotPath = new Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName()); - if (screenshotPath != null) { + if (conf.get("screenshot.location") != null) { + Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName()); OutputStream os = null; if (!fs.exists(screenshotPath)) { LOG.debug("No existing screenshot already exists... 
creating new file at {} {}.", screenshotPath, srcFile.getName()); @@ -213,9 +213,10 @@ public class HttpWebClient { LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); } else { LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " - + "'selenium.screenshot.location' is absent from nutch-site.xml.", url); + + "'screenshot.location' is absent from nutch-site.xml.", url); } } catch (Exception e) { + cleanUpDriver(driver); throw new RuntimeException(e); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml index 0ed0228..bf695fe 100644 --- a/src/plugin/protocol-htmlunit/build.xml +++ b/src/plugin/protocol-htmlunit/build.xml @@ -43,11 +43,4 @@ </copy> </target> - <!-- for junit test --> - <!-- - <mkdir dir="${build.test}/data" /> - <copy todir="${build.test}/data"> - <fileset dir="jsp"/> - </copy>--> - </project> http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java index 83b7687..c40ed69 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java @@ -30,10 +30,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * - * - */ public class Http extends HttpBase { public static final Logger LOG = LoggerFactory.getLogger(Http.class); 
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index a2f3b1e..7242f40 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -38,10 +38,6 @@ import org.apache.nutch.protocol.http.api.HttpException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * An HTTP response. - * - */ public class HttpResponse implements Response { private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); @@ -61,7 +57,7 @@ public class HttpResponse implements Response { this.http = http; this.url = url; - LOG.info("fetching " + url); + LOG.info("fetching {}", url); String path = "".equals(url.getFile()) ? "/" : url.getFile(); http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html index 34d1d1c..4181951 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ <html> <body> <p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
