Repository: nutch
Updated Branches:
  refs/heads/master d6bcefd92 -> 044e8e77e


fix for NUTCH-2191 contributed by karanjeets


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fa334722
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fa334722
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fa334722

Branch: refs/heads/master
Commit: fa33472297aca6a6468461bb6945225c93590d6d
Parents: a9b2491
Author: Karanjeet Singh <[email protected]>
Authored: Sat Mar 26 23:21:28 2016 -0700
Committer: Karanjeet Singh <[email protected]>
Committed: Sat Mar 26 23:21:28 2016 -0700

----------------------------------------------------------------------
 build.xml                                       |   6 +
 conf/nutch-default.xml                          |  66 ++++
 src/plugin/build.xml                            |   4 +
 src/plugin/lib-htmlunit/build-ivy.xml           |  54 +++
 src/plugin/lib-htmlunit/build.xml               |  28 ++
 src/plugin/lib-htmlunit/ivy.xml                 |  52 +++
 src/plugin/lib-htmlunit/plugin.xml              | 166 +++++++++
 .../protocol/htmlunit/HtmlUnitWebDriver.java    | 190 ++++++++++
 .../htmlunit/HtmlUnitWebWindowListener.java     |  36 ++
 src/plugin/protocol-htmlunit/build.xml          |  53 +++
 src/plugin/protocol-htmlunit/ivy.xml            |  38 ++
 src/plugin/protocol-htmlunit/plugin.xml         |  51 +++
 .../apache/nutch/protocol/htmlunit/Http.java    |  67 ++++
 .../nutch/protocol/htmlunit/HttpResponse.java   | 350 +++++++++++++++++++
 .../apache/nutch/protocol/htmlunit/package.html |   5 +
 15 files changed, 1166 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index f8aa196..5cff1ea 100644
--- a/build.xml
+++ b/build.xml
@@ -189,6 +189,7 @@
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
+      <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
       <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -203,6 +204,7 @@
       <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
@@ -629,6 +631,7 @@
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
+      <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
       <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -643,6 +646,7 @@
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
@@ -1033,6 +1037,7 @@
         <source path="${plugins.dir}/index-static/src/test/" />
         <source path="${plugins.dir}/language-identifier/src/java/" />
         <source path="${plugins.dir}/language-identifier/src/test/" />
+        <source path="${plugins.dir}/lib-htmlunit/src/java/" />
         <source path="${plugins.dir}/lib-http/src/java/" />
         <source path="${plugins.dir}/lib-http/src/test/" />
         <source path="${plugins.dir}/lib-selenium/src/java/" />
@@ -1057,6 +1062,7 @@
         <source path="${plugins.dir}/protocol-file/src/java/" />
         <source path="${plugins.dir}/protocol-file/src/test/" />
         <source path="${plugins.dir}/protocol-ftp/src/java/" />
+        <source path="${plugins.dir}/protocol-htmlunit/src/java"/>
         <source path="${plugins.dir}/protocol-httpclient/src/java/" />
         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
         <source path="${plugins.dir}/protocol-http/src/java/" />

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 93503f3..a5f17bf 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1874,6 +1874,72 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
+  <name>htmlunit.page.load.delay</name>
+  <value>3</value>
+  <description>
+    The delay in seconds to use when loading a page with lib-htmlunit. This
+    setting is used by protocol-htmlunit since it depends on
+    lib-htmlunit for fetching.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.enable.javascript</name>
+  <value>true</value>
+  <description>
+    A Boolean value representing if javascript should
+    be enabled or disabled when using htmlunit. The default value is enabled. 
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.javascript.timeout</name>
+  <value>3500</value>
+  <description>
+    The timeout in milliseconds when loading javascript with lib-htmlunit. This
+    setting is used by protocol-htmlunit since it depends on
+    lib-htmlunit for fetching.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.enable.css</name>
+  <value>false</value>
+  <description>
+    A Boolean value representing if CSS should
+    be enabled or disabled when using htmlunit. The default value is disabled.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-htmlunit
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'htmlunit.screenshot.location' 
+    property as this determines the location screenshots should be 
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>htmlunit.screenshot.location</name>
+  <value></value>
+  <description>
+    The location, on the file system configured for Hadoop (e.g. HDFS),
+    where a URL screenshot should be saved if the
+    'htmlunit.take.screenshot' property is set to true.
+    By default this is unset; in that case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
+
 <!-- protocol-selenium plugin properties -->
 
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 10731b3..75ae2e7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -53,6 +53,8 @@
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
      <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-htmlunit" target="deploy"/>
+     <ant dir="protocol-htmlunit" target="deploy" />
      <ant dir="lib-selenium" target="deploy"/>
      <ant dir="protocol-selenium" target="deploy" />
      <ant dir="protocol-interactiveselenium" target="deploy" />
@@ -170,6 +172,8 @@
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-htmlunit" target="clean"/>
+    <ant dir="protocol-htmlunit" target="clean" />
     <ant dir="lib-selenium" target="clean"/>
     <ant dir="protocol-selenium" target="clean" />
     <ant dir="protocol-interactiveselenium" target="clean" />

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/build-ivy.xml 
b/src/plugin/lib-htmlunit/build-ivy.xml
new file mode 100644
index 0000000..7022f4e
--- /dev/null
+++ b/src/plugin/lib-htmlunit/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without 
any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not 
already dropped
+              it into ant's lib dir (note that the latter copy will always 
take precedence).
+              We will not fail as long as local lib dir exists (it may be 
empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/build.xml 
b/src/plugin/lib-htmlunit/build.xml
new file mode 100644
index 0000000..14f5d8f
--- /dev/null
+++ b/src/plugin/lib-htmlunit/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
new file mode 100644
index 0000000..6430535
--- /dev/null
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" 
rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/plugin.xml 
b/src/plugin/lib-htmlunit/plugin.xml
new file mode 100644
index 0000000..290a137
--- /dev/null
+++ b/src/plugin/lib-htmlunit/plugin.xml
@@ -0,0 +1,166 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common HtmlUnit WebDriver library shared by HtmlUnit-based protocol
+ ! plugins (e.g. protocol-htmlunit)
+ !-->
+<plugin
+   id="lib-htmlunit"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-htmlunit.jar">
+        <export name="*"/>
+     </library>
+     <!-- all classes from dependent libraries are exported -->
+     <library name="cglib-nodep-2.1_3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-codec-1.9.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-collections-3.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-exec-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-jxpath-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-lang3-3.3.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-logging-1.1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="cssparser-0.9.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="gson-2.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="guava-18.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-2.15.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-core-js-2.15.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.3.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.3.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.3.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="ini4j-0.5.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-http-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-io-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-util-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-websocket-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-3.4.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="nekohtml-1.9.21.jar">
+       <export name="*"/>
+     </library>
+     <library name="netty-3.5.2.Final.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="operalaunchers-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="phantomjsdriver-1.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="platform-3.4.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="protobuf-java-2.4.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="sac-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-api-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-chrome-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-firefox-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-htmlunit-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-ie-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-java-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-remote-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-safari-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-support-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="serializer-2.7.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="webbit-0.4.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="xml-apis-1.4.01.jar">
+       <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
new file mode 100644
index 0000000..fc231c3
--- /dev/null
+++ 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.htmlunit.HtmlUnitDriver;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.gargoylesoftware.htmlunit.WebClient;
+
+public class HtmlUnitWebDriver extends HtmlUnitDriver {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(HtmlUnitWebDriver.class);
+  private static boolean enableJavascript;
+  private static boolean enableCss;
+  private static boolean enableRedirect;
+  private static long javascriptTimeout;
+  private static int maxRedirects;
+  
+  public HtmlUnitWebDriver() {
+       super(enableJavascript);
+  }
+  
+  @Override
+  protected WebClient modifyWebClient(WebClient client) {
+         client.getOptions().setJavaScriptEnabled(enableJavascript);
+         client.getOptions().setCssEnabled(enableCss);
+         client.getOptions().setRedirectEnabled(enableRedirect);
+         if(enableJavascript)
+                 client.setJavaScriptTimeout(javascriptTimeout);
+         client.getOptions().setThrowExceptionOnScriptError(false);
+         if(enableRedirect)
+                 client.addWebWindowListener(new 
HtmlUnitWebWindowListener(maxRedirects));
+         return client;
+  }
+  
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+         long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+         enableJavascript = conf.getBoolean("htmlunit.enable.javascript", 
true);
+         enableCss = conf.getBoolean("htmlunit.enable.css", false);
+         javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+         int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
+         enableRedirect = redirects <= 0 ? false : true;
+         maxRedirects = redirects;
+         
+         WebDriver driver = null;
+         
+         try {
+                 driver = new HtmlUnitWebDriver();
+                 driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, 
TimeUnit.SECONDS);
+                 driver.get(url);
+         } catch(Exception e) {
+                 if(e instanceof TimeoutException) {
+                               LOG.debug("HtmlUnit WebDriver: Timeout 
Exception: Capturing whatever loaded so far...");
+                               return driver;
+                       }
+                       cleanUpDriver(driver);
+                   throw new RuntimeException(e);
+         }
+
+      return driver;
+  }
+
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+      try {
+                 if (conf.getBoolean("htmlunit.take.screenshot", false))
+                 takeScreenshot(driver, conf);
+                 
+                 String innerHtml = "";
+             if(enableJavascript) {
+                 WebElement body = driver.findElement(By.tagName("body"));
+                 innerHtml = 
(String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
+             }
+             else
+                 innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+             return innerHtml;
+      } catch(Exception e) {
+         TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+         cleanUpDriver(driver);
+         throw new RuntimeException(e);
+      } 
+  }
+
+  public static void cleanUpDriver(WebDriver driver) {
+      if (driver != null) {
+          try {
+                 driver.close();
+              driver.quit();
+              TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+          } catch (Exception e) {
+              throw new RuntimeException(e);
+          }
+      }
+  }
+
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
+
+    try {
+      if (conf.getBoolean("htmlunit.take.screenshot", false))
+         takeScreenshot(driver, conf);
+
+      
+      String innerHtml = "";
+      if(enableJavascript) {
+         WebElement body = driver.findElement(By.tagName("body"));
+         innerHtml = 
(String)((JavascriptExecutor)driver).executeScript("return 
arguments[0].innerHTML;", body); 
+      }
+      else
+         innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+      return innerHtml;
+
+    } catch (Exception e) {
+             TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+             throw new RuntimeException(e);
+    } finally {
+       cleanUpDriver(driver);
+    }
+  }
+
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+      FileSystem fs = FileSystem.get(conf);
+      if (conf.get("htmlunit.screenshot.location") != null) {
+       Path screenshotPath = new Path(conf.get("htmlunit.screenshot.location") 
+ "/" + srcFile.getName());
+        OutputStream os = null;
+        if (!fs.exists(screenshotPath)) {
+          LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
+          os = fs.create(screenshotPath);
+        }
+        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+        IOUtils.copyBytes(is, os, conf);
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
+      } else {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) 
as value for "
+            + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", 
url);
+      }
+    } catch (Exception e) {
+       cleanUpDriver(driver);
+       throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
new file mode 100644
index 0000000..760f4aa
--- /dev/null
+++ 
b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -0,0 +1,36 @@
+package org.apache.nutch.protocol.htmlunit;
+
+import com.gargoylesoftware.htmlunit.WebWindowEvent;
+import com.gargoylesoftware.htmlunit.WebWindowListener;
+
+/**
+ * {@link WebWindowListener} that counts how often the content of a web
+ * window changes and fails once the count exceeds a configured maximum.
+ * Each content change is counted as one redirect.
+ */
+public class HtmlUnitWebWindowListener implements WebWindowListener {
+
+  /** Number of content changes seen so far. */
+  private int redirectCount = 0;
+
+  /** Number of content changes tolerated before an exception is thrown. */
+  private final int maxRedirects;
+
+  /** Creates a listener with a maximum of zero redirects. */
+  public HtmlUnitWebWindowListener() {
+    this(0);
+  }
+
+  /**
+   * Creates a listener with the given limit.
+   *
+   * @param maxRedirects the number of content changes to tolerate
+   */
+  public HtmlUnitWebWindowListener(int maxRedirects) {
+    this.maxRedirects = maxRedirects;
+  }
+
+  /** Not used; window openings are not counted. */
+  @Override
+  public void webWindowOpened(WebWindowEvent event) {
+    // nothing to do
+  }
+
+  /**
+   * Counts the content change and aborts when the limit is exceeded.
+   *
+   * @throws RuntimeException when more than <code>maxRedirects</code>
+   *         content changes have been observed
+   */
+  @Override
+  public void webWindowContentChanged(WebWindowEvent event) {
+    redirectCount++;
+    if (redirectCount > maxRedirects) {
+      throw new RuntimeException("Redirect Count: " + redirectCount
+          + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+    }
+  }
+
+  /** Not used; window closings are not counted. */
+  @Override
+  public void webWindowClosed(WebWindowEvent event) {
+    // nothing to do
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml 
b/src/plugin/protocol-htmlunit/build.xml
new file mode 100644
index 0000000..0ed0228
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+  <!-- Ant build for the protocol-htmlunit plugin; the default target is
+       jar-core and shared definitions are pulled in from build-plugin.xml. -->
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-htmlunit/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <!-- NOTE(review): lib-htmlunit is built in deps-jar but is not deployed
+       here; confirm that the unit tests do not need it at runtime. -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <!-- Disabled: would create ${build.test}/data and copy the jsp directory
+       into it. -->
+  <!--
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="jsp"/>
+  </copy>-->
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/ivy.xml 
b/src/plugin/protocol-htmlunit/ivy.xml
new file mode 100644
index 0000000..8aa78d2
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/ivy.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/plugin.xml 
b/src/plugin/protocol-htmlunit/plugin.xml
new file mode 100644
index 0000000..36bcb80
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-htmlunit"
+   name="HtmlUnit Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.apache.org">
+
+  <!-- The plugin's own jar; all of its classes are exported. -->
+  <runtime>
+    <library name="protocol-htmlunit.jar">
+      <export name="*"/>
+    </library>
+  </runtime>
+
+  <!-- Plugins this one depends on. -->
+  <requires>
+    <import plugin="nutch-extensionpoints"/>
+    <import plugin="lib-http"/>
+    <import plugin="lib-htmlunit"/>
+  </requires>
+
+  <!-- Registers the same Http class on the Protocol extension point twice,
+       once with protocolName "http" and once with "https". -->
+  <extension id="org.apache.nutch.protocol.http"
+             name="HttpProtocol"
+             point="org.apache.nutch.protocol.Protocol">
+
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="http"/>
+    </implementation>
+      
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="https"/>
+    </implementation>
+
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
new file mode 100644
index 0000000..83b7687
--- /dev/null
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * HTTP protocol implementation of the protocol-htmlunit plugin.
+ * <p>
+ * This class adds no behaviour of its own beyond supplying its logger to
+ * {@link HttpBase} and creating an {@link HttpResponse} for every request.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /**
+   * Creates the protocol instance and hands this class's logger to the
+   * superclass.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Applies the given {@link org.apache.hadoop.conf.Configuration} by
+   * delegating to the superclass.
+   *
+   * @param conf the configuration to apply
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Produces the response for a URL by constructing an {@link HttpResponse}.
+   *
+   * @param url the URL to fetch
+   * @param datum the crawl datum handed on to the response
+   * @param redirect not used by this implementation
+   * @return the response created for {@code url}
+   * @throws ProtocolException if building the response fails at protocol level
+   * @throws IOException if building the response fails with an I/O error
+   */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    final Response response = new HttpResponse(this, url, datum);
+    return response;
+  }
+
+  /**
+   * Command-line entry point: creates a configured instance and passes it,
+   * together with the arguments, to the two-argument {@code main}.
+   *
+   * @param args command-line arguments, passed through unchanged
+   * @throws Exception whatever the delegated call throws
+   */
+  public static void main(String[] args) throws Exception {
+    final Http protocol = new Http();
+    protocol.setConf(NutchConfiguration.create());
+    main(protocol, args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
new file mode 100644
index 0000000..72b1fa1
--- /dev/null
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -0,0 +1,350 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An HTTP response obtained over a plain socket.
+ * <p>
+ * The constructor sends an HTTP/1.0 GET request, parses the status line and
+ * the headers, and then fills the content: for {@code text/html} and
+ * {@code application/xhtml} responses the URL is handed to
+ * {@link HtmlUnitWebDriver}; for every other content type the body is read
+ * from the socket, limited by Content-Length and the configured maximum.
+ * When the response carries no Content-Type header the content stays
+ * {@code null}.
+ */
+public class HttpResponse implements Response {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class);
+
+  private final Http http;
+  private final URL url;
+  private byte[] content;
+  private int code;
+  private final Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private final Configuration conf;
+
+  /**
+   * Fetches {@code url} and populates status code, headers and content.
+   *
+   * @param http protocol instance supplying timeout, proxy, user agent and
+   *        content-limit settings
+   * @param url the URL to fetch
+   * @param datum crawl datum; a positive modified time adds an
+   *        If-Modified-Since header to the request
+   * @throws ProtocolException if the status line or a header is malformed
+   * @throws IOException on socket errors, or when reading the body of a
+   *         200 response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+
+    LOG.info("fetching " + url);
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      // NOTE(review): 80 is used whatever the URL scheme is, and the socket
+      // below is a plain one, so https URLs are not handled here (see TODO).
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuilder reqStr = new StringBuilder("GET ");
+      if (http.useProxy(url)) {
+        // a proxy needs the absolute URL in the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      // TODO: Write code for Https
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // encode with a fixed charset instead of the platform default
+      byte[] reqBytes = reqStr.toString().getBytes("UTF-8");
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuilder line = new StringBuilder();
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with HtmlUnit only if content type is HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            // Read up to contentLength bytes. Each read is capped at what is
+            // still allowed, so the chunk that reaches the limit is kept
+            // (truncated to the limit) instead of being dropped entirely.
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            while (totalRead < contentLength) {
+              int wanted = Math.min(buffer.length, contentLength - totalRead);
+              int bufferFilled = in.read(buffer, 0, wanted);
+              if (bufferFilled == -1) {
+                break; // end of stream before the limit was reached
+              }
+              out.write(buffer, 0, bufferFilled);
+              totalRead += bufferFilled;
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString(), e);
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            if (in != null) {
+              in.close();
+            }
+          }
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /**
+   * Sets the content to the page returned by
+   * {@link HtmlUnitWebDriver#getHtmlPage} for the URL, encoded as UTF-8.
+   */
+  private void readPlainContent(URL url) throws IOException {
+    String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
+    content = page.getBytes("UTF-8");
+  }
+
+  /**
+   * Reads the status line and returns the numeric code found between the
+   * first and the second space (or the end of the line).
+   *
+   * @throws HttpException if that part of the line is not a number
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuilder line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  /**
+   * Splits one header line at the first colon and stores it in the headers.
+   * A line without a colon is skipped when it is all whitespace and rejected
+   * otherwise.
+   *
+   * @throws HttpException if a non-blank line contains no colon
+   */
+  private void processHeaderLine(StringBuilder line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuilder line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        // push the start of the document back and keep only the header part
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line into {@code line} (cleared first), without its terminator.
+   *
+   * @param allowContinuedLine when true, a non-empty line followed by a space
+   *        or tab is joined with the following line
+   * @return the length of the line read
+   * @throws EOFException if the stream ends before a line terminator
+   */
+  private static int readLine(PushbackInputStream in, StringBuilder line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // intentional fall-through: CR, with or without LF, ends the line
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte without consuming it, or -1 at end of stream.
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // unread(-1) would push back byte 0xFF and hide the end of the stream
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+
+  /** Returns the URL this response was created for. */
+  public URL getUrl() {
+    return url;
+  }
+
+  /** Returns the value stored for the named header, as held in the headers. */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /** Returns the parsed response headers. */
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /** Returns the content bytes; {@code null} when no content was read. */
+  public byte[] getContent() {
+    return content;
+  }
+
+  /** Returns the status code parsed from the status line. */
+  @Override
+  public int getCode() {
+    return code;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git 
a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
new file mode 100644
index 0000000..34d1d1c
--- /dev/null
+++ 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the HTTP protocol, using HtmlUnit to obtain HTML and XHTML pages.</p><p></p>
+</body>
+</html>

Reply via email to