Author: lewismc
Date: Thu Feb 26 18:31:39 2015
New Revision: 1662530
URL: http://svn.apache.org/r1662530
Log:
NUTCH-1933 nutch-selenium plugin
Added:
nutch/trunk/src/plugin/lib-selenium/
nutch/trunk/src/plugin/lib-selenium/build.xml
nutch/trunk/src/plugin/lib-selenium/ivy.xml
nutch/trunk/src/plugin/lib-selenium/plugin.xml
nutch/trunk/src/plugin/lib-selenium/src/
nutch/trunk/src/plugin/lib-selenium/src/java/
nutch/trunk/src/plugin/lib-selenium/src/java/org/
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
nutch/trunk/src/plugin/protocol-selenium/
nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
nutch/trunk/src/plugin/protocol-selenium/build.xml
nutch/trunk/src/plugin/protocol-selenium/ivy.xml
nutch/trunk/src/plugin/protocol-selenium/plugin.xml
nutch/trunk/src/plugin/protocol-selenium/src/
nutch/trunk/src/plugin/protocol-selenium/src/java/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
nutch/trunk/src/plugin/protocol-selenium/src/target/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/plugin/build.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
+
* NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel,
lewismc)
* NUTCH-1724 LinkDBReader to support regex output filtering (markus)
Modified: nutch/trunk/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
@@ -184,6 +184,7 @@
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
+ <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
<packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
<packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -197,6 +198,7 @@
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -591,6 +593,7 @@
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
+ <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
<packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
<packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -604,6 +607,7 @@
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -985,6 +989,8 @@
<source path="${plugins.dir}/language-identifier/src/test/" />
<source path="${plugins.dir}/lib-http/src/java/" />
<source path="${plugins.dir}/lib-http/src/test/" />
+ <source path="${plugins.dir}/lib-selenium/src/java/" />
+ <source path="${plugins.dir}/lib-selenium/src/test/" />
<source path="${plugins.dir}/lib-regex-filter/src/java/" />
<source path="${plugins.dir}/lib-regex-filter/src/test/" />
<source path="${plugins.dir}/microformats-reltag/src/java/" />
@@ -1008,6 +1014,8 @@
<source path="${plugins.dir}/protocol-httpclient/src/test/" />
<source path="${plugins.dir}/protocol-http/src/java/" />
<source path="${plugins.dir}/protocol-http/src/test/" />
+ <source path="${plugins.dir}/protocol-selenium/src/java"/>
+ <source path="${plugins.dir}/protocol-selenium/src/test"/>
<source path="${plugins.dir}/scoring-depth/src/java/" />
<source path="${plugins.dir}/scoring-link/src/java/" />
<source path="${plugins.dir}/scoring-opic/src/java/" />
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
@@ -23,24 +23,24 @@
database etc.
</description>
</info>
-
+
<configurations>
<include file="${basedir}/ivy/ivy-configurations.xml" />
</configurations>
-
+
<publications>
<!--get the artifact from our module name -->
<artifact conf="master" />
</publications>
-
+
<dependencies>
<dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
conf="*->master" />
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
conf="*->master" />
-
+
<dependency org="log4j" name="log4j" rev="1.2.15"
conf="*->master" />
-
+
<dependency org="commons-lang" name="commons-lang" rev="2.6"
conf="*->default" />
<dependency org="commons-collections" name="commons-collections"
@@ -49,7 +49,7 @@
rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
-
+
<dependency org="org.apache.hadoop" name="hadoop-core"
rev="1.2.0"
conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
Modified: nutch/trunk/src/plugin/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
@@ -50,6 +50,8 @@
<ant dir="protocol-ftp" target="deploy"/>
<ant dir="protocol-http" target="deploy"/>
<ant dir="protocol-httpclient" target="deploy"/>
+ <ant dir="lib-selenium" target="deploy"/>
+ <ant dir="protocol-selenium" target="deploy" />
<ant dir="parse-ext" target="deploy"/>
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-html" target="deploy"/>
@@ -149,6 +151,8 @@
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-http" target="clean"/>
<ant dir="protocol-httpclient" target="clean"/>
+ <ant dir="lib-selenium" target="clean"/>
+ <ant dir="protocol-selenium" target="clean" />
<ant dir="parse-ext" target="clean"/>
<ant dir="parse-js" target="clean"/>
<ant dir="parse-html" target="clean"/>
Added: nutch/trunk/src/plugin/lib-selenium/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+ <!-- Ant build for the lib-selenium plugin; the default target is jar-core. -->
+
+ <!-- Pulls in the build definitions from the parent plugin directory. -->
+ <import file="../build-plugin.xml"/>
+
+ <!-- Add compilation dependencies to classpath: every jar found in a
+ lib-http directory below ${nutch.root}/build.
+ NOTE(review): HttpWebClient, the only class added to this plugin here,
+ imports nothing from lib-http; confirm this dependency is needed. -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ </fileset>
+ </path>
+</project>
Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <!-- Ivy descriptor for the lib-selenium plugin: declares the Selenium
+ libraries the plugin depends on. The module name is taken from the Ant
+ project name. -->
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- Reuses the configuration definitions kept in the top-level ivy directory. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <!-- begin selenium dependencies -->
+ <dependency org="org.seleniumhq.selenium" name="selenium-java"
rev="2.44.0" />
+
+ <!-- Opera driver without its selenium-remote-driver dependency;
+ presumably so that only the copy resolved through selenium-java is
+ used (TODO confirm). -->
+ <dependency org="com.opera" name="operadriver" rev="1.5">
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ </dependency>
+ <!-- end selenium dependencies -->
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! Selenium WebDriver support library shared by Nutch protocol plugins.
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <!-- The plugin's own jar; all of its classes are exported. -->
+   <runtime>
+      <library name="lib-selenium.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <!-- Third-party jars this plugin relies on. The file names carry the
+        revisions declared in this plugin's ivy.xml (selenium-java 2.44.0,
+        operadriver 1.5).
+        NOTE(review): library entries are normally listed under runtime in
+        Nutch plugin descriptors; confirm that entries placed under requires
+        are honoured by the plugin loader. -->
+   <requires>
+      <library name="selenium-java-2.44.0.jar">
+         <export name="*"/>
+      </library>
+      <library name="operadriver-1.5.jar">
+         <export name="*"/>
+         <exclude name="selenium-remote-driver" />
+      </library>
+   </requires>
+</plugin>
Added:
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
==============================================================================
---
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
(added)
+++
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
Thu Feb 26 18:31:39 2015
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.support.ui.WebDriverWait;
+
+import java.lang.String;
+
+/**
+ * Loads pages in a Selenium-driven Firefox browser and returns the
+ * {@code innerHTML} of the page's {@code body} element.
+ */
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
+
+  /** Maximum number of seconds to wait for the {@code body} element after navigation. */
+  private static final long PAGE_LOAD_WAIT_SECONDS = 3;
+
+  /**
+   * Per-thread Firefox driver created with profile preferences intended to
+   * turn off stylesheet, image and Flash loading. {@link #getHtmlPage} does
+   * not use it; every call there starts its own browser.
+   */
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue() {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
+      return new FirefoxDriver(profile);
+    }
+  };
+
+  /**
+   * Opens {@code url} in a new Firefox instance, waits for the {@code body}
+   * element to be present and returns its {@code innerHTML}. The browser is
+   * always shut down before this method returns.
+   *
+   * @param url  address of the page to load
+   * @param conf Nutch configuration; currently not consulted
+   * @return the {@code innerHTML} of the page's {@code body} element
+   * @throws RuntimeException wrapping any failure while starting the browser,
+   *         loading the page, or waiting for the {@code body} element
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = null;
+    try {
+      driver = new FirefoxDriver();
+      driver.get(url);
+
+      // Constructing a WebDriverWait does not block by itself; until() is
+      // what actually polls, here until <body> exists or the timeout expires.
+      return new WebDriverWait(driver, PAGE_LOAD_WAIT_SECONDS)
+          .until(org.openqa.selenium.support.ui.ExpectedConditions
+              .presenceOfElementLocated(By.tagName("body")))
+          .getAttribute("innerHTML");
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    } finally {
+      if (driver != null) {
+        try {
+          driver.quit();
+        } catch (Exception e) {
+          // Throwing from finally would discard the fetched page or hide the
+          // original failure, so a failed shutdown is only logged.
+          LOG.warn("Failed to quit WebDriver after fetching " + url, e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Same as {@link #getHtmlPage(String, Configuration)} with a {@code null}
+   * configuration.
+   *
+   * @param url address of the page to load
+   * @return the {@code innerHTML} of the page's {@code body} element
+   */
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+}
\ No newline at end of file
Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26 18:31:39
2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <!-- Standalone helper build: bootstraps Ivy and retrieves this plugin's
+       dependencies into lib/. -->
+
+  <!-- Expose environment variables as env.* properties. Without this,
+       env.IVY_HOME is never defined when this script runs on its own, so the
+       condition below could not pick up IVY_HOME from the environment. -->
+  <property environment="env" />
+
+  <property name="ivy.install.version" value="2.1.0" />
+  <condition property="ivy.home" value="${env.IVY_HOME}">
+    <isset property="env.IVY_HOME" />
+  </condition>
+  <!-- Fallback location: Ant properties are immutable, so this only takes
+       effect when ivy.home was not set by the condition above. -->
+  <property name="ivy.home" value="${user.home}/.ant" />
+  <property name="ivy.checksums" value="" />
+  <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+  <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+  <!-- Skipped when the "offline" property is set. -->
+  <target name="download-ivy" unless="offline">
+
+    <mkdir dir="${ivy.jar.dir}"/>
+    <!-- download Ivy from web site so that it can be used even without
+         any special installation -->
+    <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+         dest="${ivy.jar.file}" usetimestamp="true"/>
+  </target>
+
+  <target name="init-ivy" depends="download-ivy">
+    <!-- try to load ivy here from ivy home, in case the user has not
+         already dropped it into ant's lib dir (note that the latter copy
+         will always take precedence). We will not fail as long as local
+         lib dir exists (it may be empty) and ivy is in at least one of
+         ant's lib dir or the local lib dir. -->
+    <path id="ivy.lib.path">
+      <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+    </path>
+    <taskdef resource="org/apache/ivy/ant/antlib.xml"
+             uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+  </target>
+
+  <!-- Default target: copies the resolved artifacts to lib/ as
+       [artifact]-[revision].[ext]. -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>
Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="jar-core">
+ <!-- Ant build for the protocol-selenium plugin; the default target is jar-core. -->
+
+ <!-- Pulls in the build definitions from the parent plugin directory. -->
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies: runs the jar target of the sibling
+ lib-http and lib-selenium plugins, without passing this project's
+ properties on to them. -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-http"/>
+ <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath: every jar found in a
+ lib-http or lib-selenium directory below ${nutch.root}/build. -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ <include name="**/lib-selenium/*.jar" />
+ </fileset>
+ </path>
+
+</project>
Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <!-- Ivy descriptor for the protocol-selenium plugin. The module name is
+ taken from the Ant project name. -->
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- Reuses the configuration definitions kept in the top-level ivy directory. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <!-- NOTE(review): lib-selenium/ivy.xml and ivy/ivy.xml in this commit
+ publish with conf="master"; confirm that "default" is intended here. -->
+ <artifact conf="default"/>
+ </publications>
+
+ <dependencies>
+ <!-- begin selenium dependencies -->
+ <dependency org="org.seleniumhq.selenium" name="selenium-java"
rev="2.44.0" />
+
+ <!-- Opera driver without its selenium-remote-driver dependency;
+ presumably so that only the copy resolved through selenium-java is
+ used (TODO confirm). -->
+ <dependency org="com.opera" name="operadriver" rev="1.5">
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ </dependency>
+ <!-- end selenium dependencies -->
+ </dependencies>
+
+</ivy-module>
Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="protocol-selenium"
+ name="Http Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <!-- Jars on this plugin's runtime classpath. Only protocol-selenium.jar
+ declares an export; the remaining entries are third-party libraries,
+ presumably the resolved dependency set of selenium-java 2.44.0 and
+ operadriver 1.5 (TODO confirm against the Ivy resolution report). -->
+ <runtime>
+ <library name="protocol-selenium.jar">
+ <export name="*"/>
+ </library>
+ <library name="cglib-nodep-2.1_3.jar"/>
+ <library name="commons-codec-1.9.jar"/>
+ <library name="commons-collections-3.2.1.jar"/>
+ <library name="commons-exec-1.1.jar"/>
+ <library name="commons-io-2.4.jar"/>
+ <library name="commons-jxpath-1.3.jar"/>
+ <library name="commons-lang3-3.3.2.jar"/>
+ <library name="commons-logging-1.1.3.jar"/>
+ <library name="cssparser-0.9.14.jar"/>
+ <library name="gson-2.3.jar"/>
+ <library name="guava-18.0.jar"/>
+ <library name="htmlunit-2.15.jar"/>
+ <library name="htmlunit-core-js-2.15.jar"/>
+ <library name="httpclient-4.3.4.jar"/>
+ <library name="httpcore-4.3.2.jar"/>
+ <library name="httpmime-4.3.3.jar"/>
+ <library name="ini4j-0.5.2.jar"/>
+ <library name="jetty-http-8.1.15.v20140411.jar"/>
+ <library name="jetty-io-8.1.15.v20140411.jar"/>
+ <library name="jetty-util-8.1.15.v20140411.jar"/>
+ <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+ <library name="jna-3.4.0.jar"/>
+ <library name="nekohtml-1.9.21.jar"/>
+ <library name="netty-3.5.2.Final.jar"/>
+ <library name="operadriver-1.5.jar"/>
+ <library name="operalaunchers-1.1.jar"/>
+ <library name="platform-3.4.0.jar"/>
+ <library name="protobuf-java-2.4.1.jar"/>
+ <library name="sac-1.3.jar"/>
+ <library name="selenium-api-2.44.0.jar"/>
+ <library name="selenium-chrome-driver-2.44.0.jar"/>
+ <library name="selenium-firefox-driver-2.44.0.jar"/>
+ <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+ <library name="selenium-ie-driver-2.44.0.jar"/>
+ <library name="selenium-java-2.44.0.jar"/>
+ <library name="selenium-remote-driver-2.44.0.jar"/>
+ <library name="selenium-safari-driver-2.44.0.jar"/>
+ <library name="selenium-support-2.44.0.jar"/>
+ <library name="serializer-2.7.1.jar"/>
+ <library name="webbit-0.4.14.jar"/>
+ <library name="xalan-2.7.1.jar"/>
+ <library name="xercesImpl-2.11.0.jar"/>
+ <library name="xml-apis-1.4.01.jar"/>
+ </runtime>
+
+ <!-- Other plugins this plugin imports. -->
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-http"/>
+ <import plugin="lib-selenium"/>
+ </requires>
+
+ <!-- Registers org.apache.nutch.protocol.selenium.Http at the
+ org.apache.nutch.protocol.Protocol extension point with
+ protocolName "http". -->
+ <extension id="org.apache.nutch.protocol.selenium"
+ name="HttpProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.selenium.Http"
+ class="org.apache.nutch.protocol.selenium.Http">
+ <parameter name="protocolName" value="http"/>
+ </implementation>
+
+ </extension>
+
+</plugin>
Added:
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
==============================================================================
---
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
(added)
+++
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
Thu Feb 26 18:31:39 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * HTTP protocol plugin built on {@link HttpBase} whose responses are created
+ * by this package's {@link HttpResponse}.
+ */
+public class Http extends HttpBase {
+
+  /** Logger for this plugin; also handed to {@link HttpBase} by the constructor. */
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the plugin, passing {@link #LOG} to the superclass. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Forwards the configuration to the superclass without adding anything.
+   *
+   * @param conf configuration to apply
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: configures a fresh instance with
+   * {@link NutchConfiguration#create()} and hands it, together with the
+   * arguments, to the inherited two-argument {@code main}.
+   *
+   * @param args command-line arguments, passed through unchanged
+   * @throws Exception whatever the delegated {@code main} throws
+   */
+  public static void main(String[] args) throws Exception {
+    Http plugin = new Http();
+    plugin.setConf(NutchConfiguration.create());
+    main(plugin, args);
+  }
+
+  /**
+   * Creates the response for {@code url}.
+   *
+   * @param url      address to fetch
+   * @param datum    crawl datum associated with the URL
+   * @param redirect not consulted by this implementation
+   * @return a new {@link HttpResponse} for the URL
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}
Added:
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
==============================================================================
---
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
(added)
+++
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
Thu Feb 26 18:31:39 2015
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/* Most of this code was borrowed from protocol-htmlunit, which in turn
+ * borrowed it from protocol-httpclient. */
+
+/**
+ * An HTTP {@link Response} for the Selenium protocol plugin.
+ *
+ * <p>
+ * The constructor performs the whole fetch. It sends an HTTP/1.0 GET request
+ * over a plain socket (through the proxy when one is configured) and parses
+ * the status line and the headers itself. What happens to the body depends
+ * on the Content-Type header:
+ * </p>
+ * <ul>
+ * <li>text/html or application/xhtml: the body on the socket is not read;
+ * the page is obtained through {@link HttpWebClient#getHtmlPage} instead;</li>
+ * <li>any other type: the body is read from the socket, limited by the
+ * Content-Length header and the configured maximum content size, and
+ * decoded if it is gzip or deflate encoded;</li>
+ * <li>no Content-Type header: nothing is read and {@link #getContent()}
+ * returns {@code null}.</li>
+ * </ul>
+ */
+public class HttpResponse implements Response {
+
+  private final Http http;
+  private final URL url;
+  // orig and base are recorded but not read anywhere in this class
+  private final String orig;
+  private final String base;
+  private byte[] content;
+  private int code;
+  private final Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private final Configuration conf;
+
+  /**
+   * Fetches {@code url} and populates status code, headers and content.
+   *
+   * @param http the protocol plugin supplying timeouts, proxy settings,
+   *          request header values and the content size limit
+   * @param url the URL to fetch; its protocol must be "http"
+   * @param datum crawl datum of the URL; a positive modified time is sent
+   *          as If-Modified-Since
+   * @throws ProtocolException if the URL is not an HTTP URL or the status
+   *           line or a header cannot be parsed
+   * @throws IOException on socket failure, on premature end of the response,
+   *           or when reading the body of a 200 response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol())) {
+      throw new HttpException("Not an HTTP url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+
+    Socket socket = null;
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect, either directly or to the configured proxy
+      String sockHost = http.useProxy() ? http.getProxyHost() : host;
+      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request; encode explicitly instead of relying on the
+      // platform default charset
+      OutputStream req = socket.getOutputStream();
+      String request = buildRequest(host, portString, path, datum);
+      req.write(request.getBytes("UTF-8"));
+      req.flush();
+
+      // process response
+      PushbackInputStream in = new PushbackInputStream(
+          new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+          Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // skip over any "100 Continue" responses preceding the real one
+      do {
+        this.code = parseStatusLine(in, line);
+        parseHeaders(in, line);
+      } while (code == 100);
+
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type is HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html")
+            || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          readRawContent(in);
+        }
+      }
+
+    } finally {
+      if (socket != null) {
+        socket.close();
+      }
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  @Override
+  public URL getUrl() {
+    return url;
+  }
+
+  @Override
+  public int getCode() {
+    return code;
+  }
+
+  @Override
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  @Override
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /**
+   * @return the fetched content, or {@code null} if none was read
+   */
+  @Override
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+
+  /**
+   * Builds the complete HTTP/1.0 GET request, including the terminating
+   * blank line. Logs an error if no user agent is configured.
+   *
+   * @param host host name for the Host header (and the proxy request line)
+   * @param portString ":port" suffix, or "" when the URL has no port
+   * @param path request path, never empty
+   * @param datum crawl datum supplying the If-Modified-Since time
+   * @return the request text
+   */
+  private String buildRequest(String host, String portString, String path,
+      CrawlDatum datum) {
+    StringBuilder reqStr = new StringBuilder("GET ");
+    if (http.useProxy()) {
+      // a proxy needs the absolute URL in the request line
+      reqStr.append(url.getProtocol()).append("://").append(host)
+          .append(portString).append(path);
+    } else {
+      reqStr.append(path);
+    }
+    reqStr.append(" HTTP/1.0\r\n");
+
+    reqStr.append("Host: ").append(host).append(portString).append("\r\n");
+
+    reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+    String userAgent = http.getUserAgent();
+    if ((userAgent == null) || (userAgent.length() == 0)) {
+      if (Http.LOG.isErrorEnabled()) {
+        Http.LOG.error("User-agent is not set!");
+      }
+    } else {
+      reqStr.append("User-Agent: ").append(userAgent).append("\r\n");
+    }
+
+    reqStr.append("Accept-Language: ").append(http.getAcceptLanguage())
+        .append("\r\n");
+
+    reqStr.append("Accept: ").append(http.getAccept()).append("\r\n");
+
+    if (datum.getModifiedTime() > 0) {
+      reqStr.append("If-Modified-Since: ")
+          .append(HttpDateFormat.toString(datum.getModifiedTime()))
+          .append("\r\n");
+    }
+    reqStr.append("\r\n");
+    return reqStr.toString();
+  }
+
+  /**
+   * Reads the response body from the socket stream into {@link #content}
+   * and closes the stream.
+   *
+   * <p>
+   * At most Content-Length bytes are read, further capped by the configured
+   * maximum content size when that is non-negative. A body whose
+   * Content-Encoding is gzip, x-gzip or deflate is decoded, because the
+   * request advertises those encodings.
+   * </p>
+   *
+   * @param in stream positioned at the first body byte
+   * @throws IOException if anything fails while the status code is 200; for
+   *           other codes failures are ignored and content stays unset
+   */
+  private void readRawContent(PushbackInputStream in) throws IOException {
+    try {
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: "
+              + contentLengthString, ex);
+        }
+      }
+
+      if (http.getMaxContent() >= 0
+          && contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
+      byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+      int totalRead = 0;
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      // Never ask for more than is still allowed, so the final chunk is
+      // kept (partially if need be) instead of being discarded.
+      while (totalRead < contentLength) {
+        int wanted = Math.min(buffer.length, contentLength - totalRead);
+        int bufferFilled = in.read(buffer, 0, wanted);
+        if (bufferFilled == -1) {
+          break;
+        }
+        out.write(buffer, 0, bufferFilled);
+        totalRead += bufferFilled;
+      }
+
+      byte[] body = out.toByteArray();
+
+      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+      if ("gzip".equals(contentEncoding)
+          || "x-gzip".equals(contentEncoding)) {
+        body = http.processGzipEncoded(body, url);
+      } else if ("deflate".equals(contentEncoding)) {
+        body = http.processDeflateEncoded(body, url);
+      }
+
+      content = body;
+
+    } catch (Exception e) {
+      if (code == 200) {
+        throw new IOException(e.toString(), e);
+      }
+      // for codes other than 200 OK, we are fine with empty content
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Obtains the page through {@link HttpWebClient#getHtmlPage} and stores
+   * it, UTF-8 encoded, as the content of this response.
+   *
+   * @param url the page to load
+   * @throws IOException if the page text cannot be encoded
+   */
+  private void readPlainContent(URL url) throws IOException {
+    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
+
+    content = page.getBytes("UTF-8");
+  }
+
+  /**
+   * Reads the status line and extracts the numeric status code.
+   *
+   * @param in response stream
+   * @param line scratch buffer, overwritten with the status line
+   * @return the status code
+   * @throws IOException if the stream ends before the line is complete
+   * @throws HttpException if the code is missing or not a number
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1) {
+      codeEnd = line.length();
+    }
+
+    try {
+      return Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': "
+          + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Splits one header line at the first colon and stores it in the headers
+   * metadata. Blank or whitespace-only lines are ignored.
+   *
+   * @param line the header line
+   * @throws HttpException if a non-blank line contains no colon
+   */
+  private void processHeaderLine(StringBuffer line) throws IOException,
+      HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++) {
+        if (!Character.isWhitespace(line.charAt(i))) {
+          break;
+        }
+      }
+      if (i == line.length()) {
+        return;
+      }
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t') {
+        break;
+      }
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines up to the blank line that ends them and adds each
+   * one to the headers metadata.
+   *
+   * <p>
+   * If a line contains the start of an HTML document, the blank line is
+   * assumed to be missing: the document part is pushed back onto the stream
+   * and header parsing stops.
+   * </p>
+   *
+   * @param in response stream
+   * @param line scratch buffer for the current line
+   * @throws IOException if the stream ends before the headers are complete
+   * @throws HttpException if a header line is malformed
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1)
+          || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        // readLine() turned each byte into one char, so ISO-8859-1 gives
+        // back exactly the bytes that were read
+        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line, terminated by CR, LF or CRLF, into {@code line} without
+   * the terminator. Each byte becomes one char.
+   *
+   * @param in response stream
+   * @param line buffer that receives the line; cleared first
+   * @param allowContinuedLine whether a following line starting with a
+   *          space or tab is appended to a non-blank line
+   * @return the length of the line read
+   * @throws EOFException if the stream ends before a terminator is seen
+   * @throws IOException on read failure
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // fall through: CR (with or without LF) ends the line like LF
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine) {
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+          }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte without consuming it.
+   *
+   * @param in response stream
+   * @return the next byte, or -1 at end of stream
+   * @throws IOException on read failure
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // unread(-1) would push back byte 0xFF and fake an extra byte at EOF
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}
Added:
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
==============================================================================
---
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
(added)
+++
nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via Selenium.</p><p></p>
+</body>
+</html>
Added:
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
==============================================================================
---
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
(added)
+++
nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via HtmlUnit.</p><p></p>
+</body>
+</html>