Author: jnioche
Date: Fri Jan 18 11:53:13 2013
New Revision: 1435101
URL: http://svn.apache.org/viewvc?rev=1435101&view=rev
Log:
NUTCH-840 Port tests from parse-html to parse-tika
Added:
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/build.xml
nutch/trunk/src/plugin/parse-tika/plugin.xml
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 18 11:53:13 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-840 Port tests from parse-html to parse-tika (lewismc, jnioche)
+
* NUTCH-1509 Implement read/write in NutchField (markus)
* NUTCH-1507 Remove FetcherOutput (markus)
Modified:
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Jan 18 11:53:13 2013
@@ -363,7 +363,7 @@ public class TestDOMContentUtils extends
if (testDOMs[0] == null)
setup();
for (int i= 0; i < testPages.length; i++) {
- ArrayList outlinks= new ArrayList();
+ ArrayList<Outlink> outlinks= new ArrayList<Outlink>();
if (i == SKIP) {
conf.setBoolean("parser.html.form.use_action", false);
utils.setConf(conf);
Modified: nutch/trunk/src/plugin/parse-tika/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/build.xml?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/build.xml Fri Jan 18 11:53:13 2013
@@ -18,11 +18,24 @@
<project name="parse-tika" default="jar-core">
<import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
+ </target>
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-nekohtml/*.jar" />
+ </fileset>
+ </path>
+
<!-- Deploy Unit test dependencies -->
<target name="deps-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
<ant target="deploy" inheritall="false" dir="../protocol-file"/>
+ <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
</target>
<!-- for junit test -->
Modified: nutch/trunk/src/plugin/parse-tika/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/plugin.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/plugin.xml Fri Jan 18 11:53:13 2013
@@ -63,9 +63,9 @@
<requires>
<import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-nekohtml"/>
</requires>
-
<extension point="org.apache.nutch.parse.Parser"
id="org.apache.nutch.parse.tika"
name="TikaParser">
Modified:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Fri Jan 18 11:53:13 2013
@@ -38,7 +38,7 @@ import org.w3c.dom.NodeList;
* DOM nodes, such as getOutlinks, getText, etc.
*
*/
-class DOMContentUtils {
+public class DOMContentUtils {
private static class LinkParams {
private String elName;
@@ -59,11 +59,11 @@ class DOMContentUtils {
private HashMap<String,LinkParams> linkParams = new HashMap<String,LinkParams>();
private Configuration conf;
- DOMContentUtils(Configuration conf) {
+ public DOMContentUtils(Configuration conf) {
setConf(conf);
}
- private void setConf(Configuration conf) {
+ public void setConf(Configuration conf) {
// forceTags is used to override configurable tag ignoring, later on
Collection<String> forceTags = new ArrayList<String>(1);
@@ -119,7 +119,7 @@ class DOMContentUtils {
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
*/
- void getText(StringBuffer sb, Node node) {
+ public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
@@ -175,7 +175,7 @@ class DOMContentUtils {
*
* @return true if a title node was found, false otherwise
*/
- boolean getTitle(StringBuffer sb, Node node) {
+ public boolean getTitle(StringBuffer sb, Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -314,7 +314,7 @@ class DOMContentUtils {
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
- void getOutlinks(URL base, ArrayList<Outlink> outlinks,
+ public void getOutlinks(URL base, ArrayList<Outlink> outlinks,
Node node) {
NodeWalker walker = new NodeWalker(node);
Modified:
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1435101&r1=1435100&r2=1435101&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Fri Jan 18 11:53:13 2013
@@ -28,7 +28,7 @@ import org.w3c.dom.*;
* noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
* instructions. All meta directives are stored in a HTMLMetaTags instance.
*/
-class HTMLMetaProcessor {
+public class HTMLMetaProcessor {
/**
* Utility class with indicators for the robots directives "noindex"
@@ -40,7 +40,7 @@ class HTMLMetaProcessor {
* values, based on any META tags found under the given
* <code>node</code>.
*/
- static final void getMetaTags (
+ public static final void getMetaTags (
HTMLMetaTags metaTags, Node node, URL currURL) {
metaTags.reset();
Added:
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java?rev=1435101&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java Fri Jan 18 11:53:13 2013
@@ -0,0 +1,333 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils extends TestCase {
+
+ private static final String[] testPages = {
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+ + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+ + "</a></a>" + "</body></html>"),
+
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+ + "</body></html>"),
+
+ // test frameset link extraction. The invalid frame in the middle
+ // will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+ + "</frame>" + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+ + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+ + "</frameset>" + "</frameset>" + "</body></html>"),
+
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ." + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
};
+
+ private static int SKIP = 9;
+
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ "test1 test2", "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "", "title",
+ "title" };
+
+ // note: should be in page-order
+ private static Outlink[][] answerOutlinks;
+
+ private static Configuration conf;
+ private static DOMContentUtils utils = null;
+
+ /**
+ * Creates the test case and hands the given name to the superclass
+ * constructor.
+ *
+ * @param name value passed on to super(name)
+ */
+ public TestDOMContentUtils(String name) {
+ super(name);
+ }
+
+ /**
+ * Builds the shared static fixtures used by the test methods: the
+ * configuration, the DOMContentUtils instance, one parsed DocumentFragment
+ * and one base URL per entry of testPages, and the expected outlinks table.
+ * The test methods call this lazily when testDOMs[0] is still null.
+ *
+ * @throws Exception declared by the signature; parse and URL errors inside
+ * the loop are caught and reported as an assertion failure instead
+ */
+ private static void setup() throws Exception {
+ conf = NutchConfiguration.create();
+ // Start with form actions enabled; testGetOutlinks toggles this per page.
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils = new DOMContentUtils(conf);
+ DOMFragmentParser parser = new DOMFragmentParser();
+ for (int i = 0; i < testPages.length; i++) {
+ DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+ try {
+ // NOTE(review): getBytes() without a charset uses the platform default
+ // encoding; the test pages appear to be pure ASCII, so this should not
+ // matter here.
+ parser.parse(
+ new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+ node);
+ testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+ } catch (Exception e) {
+ // Turn any parse or URL error into a test failure.
+ assertTrue("caught exception: " + e, false);
+ }
+ testDOMs[i] = node;
+ }
+ // Expected outlinks, one row per entry of testPages (same index), in page
+ // order. Empty rows mean that no outlink is expected for that page.
+ answerOutlinks = new Outlink[][] {
+ { new Outlink("http://www.nutch.org", "anchor"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+ { new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"), },
+ { new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+ new Outlink("http://www.nutch.org/frames/right.html", ""), },
+ { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", ""), },
+ { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+ {},
+ { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+ {},
+ { new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+ {
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+ "anchor5") } };
+
+ }
+
+ /**
+  * Compares two strings token by token, using the default StringTokenizer
+  * delimiters to split them, so that the amount and kind of separating
+  * whitespace is irrelevant.
+  *
+  * @param s1 first string
+  * @param s2 second string
+  * @return true if both strings yield the same sequence of tokens
+  */
+ private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+   StringTokenizer left = new StringTokenizer(s1);
+   StringTokenizer right = new StringTokenizer(s2);
+
+   while (left.hasMoreTokens() && right.hasMoreTokens()) {
+     if (!left.nextToken().equals(right.nextToken())) {
+       return false;
+     }
+   }
+   // Equal only if both sides ran out of tokens at the same time.
+   return !left.hasMoreTokens() && !right.hasMoreTokens();
+ }
+
+ /**
+  * Extracts the text of every parsed test page with DOMContentUtils.getText
+  * and compares it, ignoring whitespace differences, with the matching
+  * entry of answerText.
+  */
+ public void testGetText() throws Exception {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   for (int page = 0; page < testPages.length; page++) {
+     StringBuffer extracted = new StringBuffer();
+     utils.getText(extracted, testDOMs[page]);
+     String actual = extracted.toString();
+
+     String eol = System.getProperty("line.separator");
+     String message = "expecting text: " + answerText[page] + eol + eol
+         + "got text: " + actual;
+     assertTrue(message, equalsIgnoreWhitespace(answerText[page], actual));
+   }
+ }
+
+ /**
+  * Extracts the title of every parsed test page with
+  * DOMContentUtils.getTitle and compares it, ignoring whitespace
+  * differences, with the matching entry of answerTitle.
+  */
+ public void testGetTitle() throws Exception {
+   if (testDOMs[0] == null)
+     setup();
+   for (int i = 0; i < testPages.length; i++) {
+     StringBuffer sb = new StringBuffer();
+     utils.getTitle(sb, testDOMs[i]);
+     String text = sb.toString();
+     // Report the expected title (answerTitle), which is what the assertion
+     // compares against, rather than the expected page text.
+     assertTrue(
+         "expecting text: " + answerTitle[i]
+             + System.getProperty("line.separator")
+             + System.getProperty("line.separator") + "got text: " + text,
+         equalsIgnoreWhitespace(answerTitle[i], text));
+   }
+ }
+
+ /**
+  * Collects the outlinks of every parsed test page and compares them with
+  * the matching row of answerOutlinks. The "parser.html.form.use_action"
+  * setting is switched off for the page at index SKIP and on for all
+  * other pages.
+  */
+ public void testGetOutlinks() throws Exception {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   for (int page = 0; page < testPages.length; page++) {
+     // Form actions are used for every page except the SKIP page.
+     conf.setBoolean("parser.html.form.use_action", page != SKIP);
+     utils.setConf(conf);
+
+     ArrayList<Outlink> found = new ArrayList<Outlink>();
+     utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+     Outlink[] foundArr = found.toArray(new Outlink[found.size()]);
+     compareOutlinks(answerOutlinks[page], foundArr);
+   }
+ }
+
+ /**
+  * Appends the string form of each outlink to the buffer, each followed by
+  * the platform line separator.
+  *
+  * @param sb buffer that receives the output
+  * @param o outlinks to render, in order
+  */
+ private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+   for (Outlink link : o) {
+     sb.append(link.toString()).append(System.getProperty("line.separator"));
+   }
+ }
+
+ /**
+  * Renders the given outlinks as one string by delegating to
+  * appendOutlinks on a fresh buffer.
+  *
+  * @param o outlinks to render
+  * @return the rendered outlinks
+  */
+ private static final String outlinksString(Outlink[] o) {
+   StringBuffer rendered = new StringBuffer();
+   appendOutlinks(rendered, o);
+   return rendered.toString();
+ }
+
+ /**
+  * Fails the running test if the two outlink arrays differ in length or if
+  * any pair of outlinks at the same position is not equal.
+  *
+  * @param o1 expected outlinks
+  * @param o2 outlinks actually extracted
+  */
+ private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+   if (o1.length != o2.length) {
+     String eol = System.getProperty("line.separator");
+     String summary = "got wrong number of outlinks (expecting " + o1.length
+         + ", got " + o2.length + ")" + eol
+         + "answer: " + eol
+         + outlinksString(o1) + eol
+         + "got: " + eol
+         + outlinksString(o2) + eol;
+     assertTrue(summary, false);
+   }
+
+   for (int pos = 0; pos < o1.length; pos++) {
+     Outlink expected = o1[pos];
+     Outlink actual = o2[pos];
+     if (expected.equals(actual)) {
+       continue;
+     }
+     String eol = System.getProperty("line.separator");
+     assertTrue(
+         "got wrong outlinks at position " + pos + eol
+             + "answer: " + eol
+             + "'" + expected.getToUrl() + "', anchor: '" + expected.getAnchor() + "'" + eol
+             + "got: " + eol
+             + "'" + actual.getToUrl() + "', anchor: '" + actual.getAnchor() + "'",
+         false);
+   }
+ }
+}
Added:
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java?rev=1435101&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java (added)
+++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java Fri Jan 18 11:53:13 2013
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor extends TestCase {
+ /**
+ * Creates the test case and hands the given name to the superclass
+ * constructor.
+ *
+ * @param name value passed on to super(name)
+ */
+ public TestRobotsMetaProcessor(String name) {
+ super(name);
+ }
+
+ /*
+
+ some sample tags:
+
+ <meta name="robots" content="index,follow">
+ <meta name="robots" content="noindex,follow">
+ <meta name="robots" content="index,nofollow">
+ <meta name="robots" content="noindex,nofollow">
+
+ <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+
+ */
+
+
+ public static String[] tests=
+ {
+ "<html><head><title>test page</title>"
+ + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+ + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"all\"> "
+ + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+ + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"none\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,follow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,nofollow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,follow\"> "
+ + "<base href=\"http://www.nutch.org/\">"
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\"> "
+ + "<base href=\"http://www.nutch.org/base/\">"
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ };
+
+ public static final boolean[][] answers= {
+ {true, true, true}, // NONE
+ {false, false, true}, // all
+ {true, true, true}, // nOnE
+ {true, true, false}, // none
+ {true, true, false}, // noindex,nofollow
+ {true, false, false}, // noindex,follow
+ {false, true, false}, // index,nofollow
+ {false, false, false}, // index,follow
+ {false, false, false}, // missing!
+ };
+
+ private URL[][] currURLsAndAnswers;
+
+ /**
+  * Parses each page in tests with a DOMFragmentParser, runs
+  * HTMLMetaProcessor.getMetaTags on the resulting fragment and checks the
+  * noindex / nofollow / nocache flags against the matching row of answers,
+  * and the base href against the second URL of the matching
+  * currURLsAndAnswers row.
+  */
+ public void testRobotsMetaProcessor() {
+   DOMFragmentParser parser = new DOMFragmentParser();
+
+   try {
+     // Each row is {URL passed to getMetaTags, expected base href or null}.
+     currURLsAndAnswers = new URL[][] {
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org"), null},
+       {new URL("http://www.nutch.org/foo/"),
+        new URL("http://www.nutch.org/")},
+       {new URL("http://www.nutch.org"),
+        new URL("http://www.nutch.org/base/")}
+     };
+   } catch (Exception e) {
+     // Include the exception so the failure says which URL was rejected.
+     assertTrue("couldn't make test URLs! " + e, false);
+   }
+
+   for (int i = 0; i < tests.length; i++) {
+     byte[] bytes = tests[i].getBytes();
+
+     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+     try {
+       parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+     } catch (Exception e) {
+       // A parse failure invalidates every check below, so fail right here
+       // instead of printing the trace and asserting against an incomplete
+       // DOM.
+       assertTrue("caught exception parsing test " + i + ": " + e, false);
+     }
+
+     HTMLMetaTags robotsMeta = new HTMLMetaTags();
+     HTMLMetaProcessor.getMetaTags(robotsMeta, node,
+         currURLsAndAnswers[i][0]);
+
+     // answers[i] holds {noIndex, noFollow, noCache} for page i.
+     assertTrue("got index wrong on test " + i,
+         robotsMeta.getNoIndex() == answers[i][0]);
+     assertTrue("got follow wrong on test " + i,
+         robotsMeta.getNoFollow() == answers[i][1]);
+     assertTrue("got cache wrong on test " + i,
+         robotsMeta.getNoCache() == answers[i][2]);
+     // The base href must be absent when none is expected, and equal to the
+     // expected URL otherwise.
+     assertTrue("got base href wrong on test " + i + " (got "
+         + robotsMeta.getBaseHref() + ")",
+         ( (robotsMeta.getBaseHref() == null)
+           && (currURLsAndAnswers[i][1] == null) )
+         || ( (robotsMeta.getBaseHref() != null)
+              && robotsMeta.getBaseHref().equals(
+                  currURLsAndAnswers[i][1]) ) );
+
+   }
+ }
+
+}