fix for NUTCH-2191 contributed by karanjeets

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5293599
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5293599
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5293599

Branch: refs/heads/master
Commit: f52935994dec9468edf6087c5e11b3d9ed2517b1
Parents: 3cda222
Author: Karanjeet Singh <[email protected]>
Authored: Tue Mar 29 01:47:10 2016 -0700
Committer: Karanjeet Singh <[email protected]>
Committed: Tue Mar 29 01:47:10 2016 -0700

----------------------------------------------------------------------
 conf/nutch-default.xml                          | 96 ++++++--------------
 default.properties                              |  4 +-
 .../protocol/htmlunit/HtmlUnitWebDriver.java    | 42 ++++-----
 .../htmlunit/HtmlUnitWebWindowListener.java     | 16 ++++
 .../nutch/protocol/selenium/HttpWebClient.java  | 37 ++++----
 src/plugin/protocol-htmlunit/build.xml          |  7 --
 .../apache/nutch/protocol/htmlunit/Http.java    |  4 -
 .../nutch/protocol/htmlunit/HttpResponse.java   |  6 +-
 .../apache/nutch/protocol/htmlunit/package.html | 16 ++++
 9 files changed, 105 insertions(+), 123 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a5f17bf..1934991 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1874,20 +1874,44 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
-
-<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+<!-- plugin properties that apply to lib-selenium, protocol-selenium,
+     protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit -->
 
 <property>
-  <name>htmlunit.page.load.delay</name>
+  <name>page.load.delay</name>
   <value>3</value>
   <description>
-    The delay in seconds to use when loading a page with lib-htmlunit. This
-    setting is used by protocol-htmlunit since they depending on 
-    lib-htmlunit for fetching.
+    The delay in seconds to use when loading a page with htmlunit or selenium. 
+  </description>
+</property>
+
+<property>
+  <name>take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the htmlunit or selenium
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'screenshot.location'
+    property as this determines the location screenshots should be
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
   </description>
 </property>
 
 <property>
+  <name>screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the 'take.screenshot' property is set to true.
+    By default this is null, in this case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
   <name>htmlunit.enable.javascript</name>
   <value>true</value>
   <description>
@@ -1915,31 +1939,6 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
-<property>
-  <name>htmlunit.take.screenshot</name>
-  <value>false</value>
-  <description>
-    Boolean property determining whether the protocol-htmlunit
-    WebDriver should capture a screenshot of the URL. If set to
-    true remember to define the 'htmlunit.screenshot.location' 
-    property as this determines the location screenshots should be 
-    persisted to on HDFS. If that property is not set, screenshots
-    are simply discarded.
-  </description>
-</property>
-
-<property>
-  <name>htmlunit.screenshot.location</name>
-  <value></value>
-  <description>
-    The location on disk where a URL screenshot should be saved
-    to if the 'htmlunit.take.screenshot' property is set to true.
-    By default this is null, in this case screenshots held in memory
-    are simply discarded.
-  </description>
-</property>
-
-
 <!-- protocol-selenium plugin properties -->
 
 <property>
@@ -1956,30 +1955,6 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
 </property>
 
 <property>
-  <name>selenium.take.screenshot</name>
-  <value>false</value>
-  <description>
-    Boolean property determining whether the protocol-selenium
-    WebDriver should capture a screenshot of the URL. If set to
-    true remember to define the 'selenium.screenshot.location' 
-    property as this determines the location screenshots should be 
-    persisted to on HDFS. If that property is not set, screenshots
-    are simply discarded.
-  </description>
-</property>
-
-<property>
-  <name>selenium.screenshot.location</name>
-  <value></value>
-  <description>
-    The location on disk where a URL screenshot should be saved
-    to if the 'selenium.take.screenshot' property is set to true.
-    By default this is null, in this case screenshots held in memory
-    are simply discarded.
-  </description>
-</property>
-
-<property>
   <name>selenium.hub.port</name>
   <value>4444</value>
   <description>Selenium Hub Location connection port</description>
@@ -2069,17 +2044,6 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   Currently this option exist for - 'firefox' </description>
 </property>
 
-<!-- lib-selenium configuration -->
-<property>
-  <name>libselenium.page.load.delay</name>
-  <value>3</value>
-  <description>
-    The delay in seconds to use when loading a page with lib-selenium. This
-    setting is used by protocol-selenium and protocol-interactiveselenium
-    since they depending on lib-selenium for fetching.
-  </description>
-</property>
-
 <!-- protocol-interactiveselenium configuration -->
 <property>
   <name>interactiveselenium.handlers</name>

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index aec5d51..eb616c6 100644
--- a/default.properties
+++ b/default.properties
@@ -89,8 +89,8 @@ plugins.protocol=\
    org.apache.nutch.protocol.ftp*:\
    org.apache.nutch.protocol.http*:\
    org.apache.nutch.protocol.httpclient*:\
-   org.apache.nutch.protocol.selenium*
-   org.apache.nutch.protocol.htmlunit*
+   org.apache.nutch.protocol.selenium*:\
+   org.apache.nutch.protocol.htmlunit*:\
 
 #
 # URL Filter Plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
index 5e2c0ac..064894e 100644
--- 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
+++ 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -64,11 +64,11 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
       client.getOptions().setThrowExceptionOnScriptError(false);
       if(enableRedirect)
         client.addWebWindowListener(new 
HtmlUnitWebWindowListener(maxRedirects));
-       return client;
+         return client;
   }
   
   public static WebDriver getDriverForPage(String url, Configuration conf) {
-    long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+    long pageLoadTimout = conf.getLong("page.load.delay", 3);
     enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
     enableCss = conf.getBoolean("htmlunit.enable.css", false);
     javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
@@ -84,8 +84,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
       driver.get(url);
      } catch(Exception e) {
        if(e instanceof TimeoutException) {
-        LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever 
loaded so far...");
-        return driver;
+              LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing 
whatever loaded so far...");
+              return driver;
      }
      cleanUpDriver(driver);
      throw new RuntimeException(e);
@@ -96,19 +96,19 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
 
   public static String getHTMLContent(WebDriver driver, Configuration conf) {
     try {
-      if (conf.getBoolean("htmlunit.take.screenshot", false))
-      takeScreenshot(driver, conf);
+      if (conf.getBoolean("take.screenshot", false))
+        takeScreenshot(driver, conf);
                  
       String innerHtml = "";
       if(enableJavascript) {
-       WebElement body = driver.findElement(By.tagName("body"));
-       innerHtml = (String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
+             WebElement body = driver.findElement(By.tagName("body"));
+             innerHtml = 
(String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
       }
       else
-       innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+             innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
       return innerHtml;
     } catch(Exception e) {
-       TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+           TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
        cleanUpDriver(driver);
        throw new RuntimeException(e);
     } 
@@ -141,23 +141,23 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
     WebDriver driver = getDriverForPage(url, conf);
 
     try {
-      if (conf.getBoolean("htmlunit.take.screenshot", false))
-       takeScreenshot(driver, conf);
+      if (conf.getBoolean("take.screenshot", false))
+             takeScreenshot(driver, conf);
 
       String innerHtml = "";
       if(enableJavascript) {
-       WebElement body = driver.findElement(By.tagName("body"));
-       innerHtml = (String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
+             WebElement body = driver.findElement(By.tagName("body"));
+         innerHtml = 
(String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
       }
       else
-       innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+         innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
       return innerHtml;
 
     } catch (Exception e) {
-       TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-        throw new RuntimeException(e);
+           TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
     } finally {
-        cleanUpDriver(driver);
+      cleanUpDriver(driver);
     }
   }
 
@@ -167,8 +167,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
       File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
       LOG.debug("In-memory screenshot taken of: {}", url);
       FileSystem fs = FileSystem.get(conf);
-      if (conf.get("htmlunit.screenshot.location") != null) {
-       Path screenshotPath = new Path(conf.get("htmlunit.screenshot.location") 
+ "/" + srcFile.getName());
+      if (conf.get("screenshot.location") != null) {
+         Path screenshotPath = new Path(conf.get("screenshot.location") + "/" 
+ srcFile.getName());
         OutputStream os = null;
         if (!fs.exists(screenshotPath)) {
           LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
@@ -179,7 +179,7 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
         LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
       } else {
         LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) 
as value for "
-            + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", 
url);
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
       }
     } catch (Exception e) {
        cleanUpDriver(driver);

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
index baa8774..c2b88a6 100644
--- 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
+++ 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.protocol.htmlunit;
 
 import com.gargoylesoftware.htmlunit.WebWindowEvent;

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 
b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 583b840..3a20cfe 100644
--- 
a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ 
b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -68,7 +68,7 @@ public class HttpWebClient {
   public static WebDriver getDriverForPage(String url, Configuration conf) {
       WebDriver driver = null;
       DesiredCapabilities capabilities = null;
-      long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3);
+      long pageLoadWait = conf.getLong("page.load.delay", 3);
 
       try {
         String driverType  = conf.get("selenium.driver", "firefox");
@@ -129,11 +129,11 @@ public class HttpWebClient {
         driver.manage().timeouts().pageLoadTimeout(pageLoadWait, 
TimeUnit.SECONDS);
         driver.get(url);
       } catch (Exception e) {
-                       if(e instanceof TimeoutException) {
-                               LOG.debug("Selenium WebDriver: Timeout 
Exception: Capturing whatever loaded so far...");
-                               return driver;
-                       }
-                       cleanUpDriver(driver);
+                         if(e instanceof TimeoutException) {
+          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever 
loaded so far...");
+          return driver;
+                         }
+                         cleanUpDriver(driver);
                    throw new RuntimeException(e);
            } 
 
@@ -141,7 +141,7 @@ public class HttpWebClient {
   }
 
   public static String getHTMLContent(WebDriver driver, Configuration conf) {
-      if (conf.getBoolean("selenium.take.screenshot", false)) {
+      if (conf.getBoolean("take.screenshot", false)) {
         takeScreenshot(driver, conf);
       }
 
@@ -149,15 +149,15 @@ public class HttpWebClient {
   }
 
   public static void cleanUpDriver(WebDriver driver) {
-      if (driver != null) {
-          try {
+    if (driver != null) {
+      try {
              driver.close();
-              driver.quit();
-              TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-          } catch (Exception e) {
-              throw new RuntimeException(e);
-          }
+        driver.quit();
+        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
       }
+    }
   }
 
   /**
@@ -175,7 +175,7 @@ public class HttpWebClient {
     WebDriver driver = getDriverForPage(url, conf);
 
     try {
-      if (conf.getBoolean("selenium.take.screenshot", false)) {
+      if (conf.getBoolean("take.screenshot", false)) {
         takeScreenshot(driver, conf);
       }
 
@@ -201,8 +201,8 @@ public class HttpWebClient {
       File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
       LOG.debug("In-memory screenshot taken of: {}", url);
       FileSystem fs = FileSystem.get(conf);
-      Path screenshotPath = new Path(conf.get("selenium.screenshot.location") 
+ "/" + srcFile.getName());
-      if (screenshotPath != null) {
+      if (conf.get("screenshot.location") != null) {
+        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + 
srcFile.getName());
         OutputStream os = null;
         if (!fs.exists(screenshotPath)) {
           LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
@@ -213,9 +213,10 @@ public class HttpWebClient {
         LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
       } else {
         LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) 
as value for "
-            + "'selenium.screenshot.location' is absent from nutch-site.xml.", 
url);
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
       }
     } catch (Exception e) {
+      cleanUpDriver(driver);
       throw new RuntimeException(e);
     }
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml 
b/src/plugin/protocol-htmlunit/build.xml
index 0ed0228..bf695fe 100644
--- a/src/plugin/protocol-htmlunit/build.xml
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -43,11 +43,4 @@
     </copy>
   </target>
 
-  <!-- for junit test -->
-  <!--
-  <mkdir dir="${build.test}/data" />
-  <copy todir="${build.test}/data">
-    <fileset dir="jsp"/>
-  </copy>-->
-
 </project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
index 83b7687..c40ed69 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -30,10 +30,6 @@ import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-/**
- *
- *
- */
 public class Http extends HttpBase {
 
   public static final Logger LOG = LoggerFactory.getLogger(Http.class);

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index a2f3b1e..7242f40 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -38,10 +38,6 @@ import org.apache.nutch.protocol.http.api.HttpException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-/**
- * An HTTP response.
- *
- */
 public class HttpResponse implements Response {
 
   private static final Logger LOG = 
LoggerFactory.getLogger(HttpResponse.class);
@@ -61,7 +57,7 @@ public class HttpResponse implements Response {
     this.http = http;
     this.url = url;
 
-    LOG.info("fetching " + url);
+    LOG.info("fetching {}", url);
     
     String path = "".equals(url.getFile()) ? "/" : url.getFile();
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
index 34d1d1c..4181951 100644
--- 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -1,3 +1,19 @@
+<!--
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+-->
 <html>
 <body>
 <p>Protocol plugin which supports retrieving documents via the http 
protocol.</p><p></p>

Reply via email to