[55/69] [abbrv] nutch git commit: Moved test sources to maven standard directory

thammegowda Tue, 05 Jul 2016 15:49:33 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
new file mode 100644
index 0000000..96029a6
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+  private static final String[] testPages = {
+
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+
+      // test frameset link extraction. The invalid frame in the middle
+      // will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  /**
+   * Builds the shared fixtures used by the tests in this class.
+   * <p>
+   * Creates the configuration with {@code parser.html.form.use_action}
+   * enabled, instantiates the {@link DOMContentUtils} under test, parses every
+   * entry of {@code testPages} into a {@link DocumentFragment} stored in
+   * {@code testDOMs}, converts each entry of {@code testBaseHrefs} into a
+   * {@link URL}, and finally fills {@code answerOutlinks} with the outlinks
+   * expected for each page (in page order).
+   *
+   * @throws Exception
+   *           if the parser feature cannot be set
+   */
+  @Before
+  public void setup() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser = new DOMFragmentParser();
+    parser.setFeature(
+        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+        true);
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        // a page that cannot be parsed is a broken fixture: fail right away
+        Assert.fail("caught exception: " + e);
+      }
+      testDOMs[i] = node;
+    }
+    // one row per entry of testPages, in the same order
+    answerOutlinks = new Outlink[][] {
+        { new Outlink("http://www.nutch.org", "anchor"), },
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+        { new Outlink("http://www.nutch.org/", "separate this"),
+            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/1", "1"),
+            new Outlink("http://www.nutch.org/docs/2", "2"), },
+        { new Outlink("http://www.nutch.org/frames/top.html", ""),
+            new Outlink("http://www.nutch.org/frames/left.html", ""),
+            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+            new Outlink("http://www.nutch.org/frames/right.html", ""), },
+        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+            new Outlink("http://www.nutch.org/index.html", ""),
+            new Outlink("http://www.nutch.org/maps/#bottom", ""),
+            new Outlink("http://www.nutch.org/bot.html", ""),
+            new Outlink("http://www.nutch.org/docs/index.html", ""), },
+        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+        {},
+        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+        {},
+        { new Outlink("http://www.nutch.org/;x", "anchor1"),
+            new Outlink("http://www.nutch.org/g;x", "anchor2"),
+            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+        {
+            // this is tricky - see RFC3986 section 5.4.1 example 7
+            new Outlink("http://www.nutch.org/g", "anchor1"),
+            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                "anchor5") } };
+
+  }
+
+  /**
+   * Compares two strings token by token, where tokens are separated by the
+   * default {@link StringTokenizer} delimiters, so that differences in the
+   * amount or kind of whitespace are ignored.
+   *
+   * @param s1
+   *          first string
+   * @param s2
+   *          second string
+   * @return true if both strings consist of the same sequence of tokens
+   */
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer left = new StringTokenizer(s1);
+    StringTokenizer right = new StringTokenizer(s2);
+
+    // walk both token streams in lock-step while both still have tokens
+    while (left.hasMoreTokens() && right.hasMoreTokens()) {
+      String expected = left.nextToken();
+      String actual = right.nextToken();
+      if (!expected.equals(actual)) {
+        return false;
+      }
+    }
+    // equal only if both streams were exhausted at the same time
+    return !left.hasMoreTokens() && !right.hasMoreTokens();
+  }
+
+  /**
+   * Extracts the plain text of every test page and compares it, ignoring
+   * whitespace differences, with the matching entry of {@code answerText}.
+   */
+  @Test
+  public void testGetText() throws Exception {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    for (int page = 0; page < testPages.length; page++) {
+      StringBuffer extracted = new StringBuffer();
+      utils.getText(extracted, testDOMs[page]);
+      String actual = extracted.toString();
+      String failureMessage = "expecting text: " + answerText[page]
+          + System.getProperty("line.separator")
+          + System.getProperty("line.separator") + "got text: " + actual;
+      Assert.assertTrue(failureMessage,
+          equalsIgnoreWhitespace(answerText[page], actual));
+    }
+  }
+
+  /**
+   * Extracts the title of every test page and compares it, ignoring
+   * whitespace differences, with the matching entry of {@code answerTitle}.
+   */
+  @Test
+  public void testGetTitle() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String title = sb.toString();
+      // report the expected *title* on failure; the message used to print
+      // answerText[i], which is not the value being compared
+      Assert.assertTrue(
+          "expecting title: " + answerTitle[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got title: " + title,
+          equalsIgnoreWhitespace(answerTitle[i], title));
+    }
+  }
+
+  /**
+   * Extracts the outlinks of every test page and compares them with the
+   * matching row of {@code answerOutlinks}. For the page at index
+   * {@code SKIP} the {@code parser.html.form.use_action} option is switched
+   * off; for every other page it is switched on.
+   */
+  @Test
+  public void testGetOutlinks() throws Exception {
+    if (testDOMs[0] == null) {
+      setup();
+    }
+    for (int page = 0; page < testPages.length; page++) {
+      ArrayList<Outlink> found = new ArrayList<Outlink>();
+      // form actions are followed for all pages except the SKIP page
+      conf.setBoolean("parser.html.form.use_action", page != SKIP);
+      utils.setConf(conf);
+      utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+      Outlink[] actual = found.toArray(new Outlink[found.size()]);
+      compareOutlinks(answerOutlinks[page], actual);
+    }
+  }
+
+  /**
+   * Appends the string form of each outlink to the buffer, one per line.
+   *
+   * @param sb
+   *          buffer to append to
+   * @param o
+   *          outlinks to render
+   */
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    for (Outlink link : o) {
+      sb.append(link.toString()).append(System.getProperty("line.separator"));
+    }
+  }
+
+  /**
+   * Renders the given outlinks as a single string, one outlink per line.
+   *
+   * @param o
+   *          outlinks to render
+   * @return the concatenated string form of all outlinks
+   */
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer rendered = new StringBuffer();
+    appendOutlinks(rendered, o);
+    String result = rendered.toString();
+    return result;
+  }
+
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      Assert.assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
+    }
+
+    for (int i = 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        Assert.assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
+      }
+    }
+  }
+}


http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
new file mode 100644
index 0000000..c9394dc
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * 
+ * @author mattmann / jnioche
+ * 
+ *         Test Suite for the RSS feeds with the {@link TikaParser}.
+ * 
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Sample feeds, resolved relative to sampleDir
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * <p>
+   * The test method: fetches every sample file through the protocol plugin
+   * selected for its {@code file:} URL, parses it with the
+   * {@code parse-tika} extension and checks the following 2 asserts:
+   * </p>
+   * 
+   * <ul>
+   * <li>There are 2 outlinks read from the sample rss file</li>
+   * <li>The 2 outlinks read are in fact the correct outlinks from the sample
+   * file</li>
+   * </ul>
+   * 
+   * @throws ProtocolException
+   *           declared for the protocol lookup / fetch calls
+   * @throws ParseException
+   *           declared for the parse call
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      // check that there are 2 outlinks:
+      // unlike the original parse-rss
+      // tika ignores the URL and description of the channel
+      // (presumably http://test.channel.com), which leaves:
+
+      // http://www-scf.usc.edu/~mattmann/
+      // http://www.nutch.org/
+
+      ParseData theParseData = parse.getData();
+
+      Outlink[] theOutlinks = theParseData.getOutlinks();
+
+      // assertEquals reports the actual count when the check fails
+      Assert.assertEquals("There aren't 2 outlinks read!", 2,
+          theOutlinks.length);
+
+      // now check to make sure that those are the two outlinks
+      boolean hasLink1 = false, hasLink2 = false;
+
+      for (int j = 0; j < theOutlinks.length; j++) {
+        if (theOutlinks[j].getToUrl().equals(
+            "http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        }
+
+        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        }
+      }
+
+      if (!hasLink1 || !hasLink2) {
+        Assert.fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
new file mode 100644
index 0000000..b1762e6
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test extraction of image metadata
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Sample images; they are looked up in the "test.data" directory
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
+
+  /**
+   * Fetches each sample image through its {@code file:} URL, parses it with
+   * the {@code parse-tika} extension and checks the {@code width} and
+   * {@code height} values found in the parse metadata.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Configuration conf = NutchConfiguration.create();
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      Assert.assertEquals("121", parse.getData().getMeta("width"));
+      Assert.assertEquals("48", parse.getData().getMeta("height"));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
new file mode 100644
index 0000000..576b3df
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  // Prefix every sample file's extracted text must start with
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  /**
+   * Creates a fresh configuration before each test.
+   */
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    // NOTE(review): -1 presumably disables the file content size limit so
+    // documents are not truncated - confirm against the protocol-file plugin
+    conf.set("file.content.limit", "-1");
+  }
+
+  /**
+   * Fetches the named file from the sample directory through its
+   * {@code file:} URL and parses it with the {@code parse-tika} extension.
+   * 
+   * @param fileName
+   *          name of a file inside the sample directory
+   * @return the text of the resulting parse
+   * @throws ProtocolException
+   *           declared for the protocol lookup / fetch calls
+   * @throws ParseException
+   *           declared for the parse call
+   */
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  /**
+   * Checks that the text extracted from every sample file starts with the
+   * expected sentence.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
+    }
+  }
+
+  /**
+   * Checks that every {@code .doc} file found in the sample directory yields
+   * non-empty text.
+   */
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    // File.list() returns null when the path is not a readable directory;
+    // fail with a clear message instead of a NullPointerException
+    Assert.assertNotNull("cannot list sample directory " + sampleDir,
+        filenames);
+    for (int i = 0; i < filenames.length; i++) {
+      if (!filenames[i].endsWith(".doc"))
+        continue;
+      Assert.assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
new file mode 100644
index 0000000..6960bad
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for OOParser.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+  // Whitespace-normalized content of the reference text file; stays null if
+  // the file could not be read in the constructor
+  private String expectedText;
+
+  private String sampleText = "ootest.txt";
+
+  /**
+   * Fetches each {@code ootest*} sample document through its {@code file:}
+   * URL, parses it with the {@code parse-tika} extension and checks that some
+   * text was extracted. The reference text is only printed, not compared.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    Protocol protocol;
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : " + expectedText);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      if (!sampleFiles[i].startsWith("ootest"))
+        continue;
+
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = factory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      String rawText = parse.getText();
+      // check for null before normalizing, so a missing text is reported as
+      // a test failure rather than a NullPointerException
+      Assert.assertNotNull("no text extracted from " + sampleFiles[i], rawText);
+      String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
+      // in the previous tests
+      Assert.assertTrue(text.length() > 0);
+
+      System.out.println("Found " + sampleFiles[i] + ": " + text);
+    }
+  }
+
+  /**
+   * Reads the reference text file from the sample directory as UTF-8 and
+   * stores its whitespace-normalized content in {@code expectedText}. A
+   * failure to read the file is only printed, so the tests still run.
+   */
+  public TestOOParser() {
+    FileInputStream fis = null;
+    try {
+      // read the test string
+      fis = new FileInputStream(sampleDir + fileSeparator + sampleText);
+      StringBuffer sb = new StringBuffer();
+      int len = 0;
+      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+      char[] buf = new char[1024];
+      while ((len = isr.read(buf)) > 0) {
+        sb.append(buf, 0, len);
+      }
+      expectedText = sb.toString();
+      // normalize space
+      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+    } catch (Exception e) {
+      e.printStackTrace();
+    } finally {
+      // always release the file handle, even when reading failed
+      if (fis != null) {
+        try {
+          fis.close();
+        } catch (Exception ignored) {
+          // nothing useful can be done if closing a read-only stream fails
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git 
a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
new file mode 100644
index 0000000..9884f0c
--- /dev/null
+++ 
b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PDF parsing through the parse-tika plugin.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // Sample directory: the "test.data" system property, or "." when unset.
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Sample documents looked up inside sampleDir.
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+  // Text that every sample file must yield once parsed.
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  /**
+   * Fetches every sample file, parses it with the "parse-tika" extension and
+   * checks that the extracted text contains the expected sentence.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    // Shared by all files: build the configuration and factory only once.
+    Configuration conf = NutchConfiguration.create();
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+      Protocol protocol = factory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      // indexOf yields 0 for a match at the very start, so accept index >= 0.
+      Assert.assertTrue("expected text not found in " + sampleFile,
+          parse.getText().indexOf(expectedText) >= 0);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
new file mode 100644
index 0000000..f15d821
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tika;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Unit tests for RTF parsing through parse-tika. (Adapted from John Xing
+ * msword unit tests).
+ * 
+ * @author Andy Hedges
+ */
+public class TestRTFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // Sample directory: the "test.data" system property, or "." when unset.
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Name of the RTF sample looked up inside sampleDir.
+  private String rtfFile = "test.rtf";
+
+  /**
+   * Parses the sample RTF file with the "parse-tika" extension and checks the
+   * extracted text, the title and the Dublin Core subject. Currently ignored.
+   */
+  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+    // Fetch the raw content through the protocol layer.
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+
+    // Parse it and pick the result stored under the content URL.
+    ParseUtil parseUtil = new ParseUtil(conf);
+    Parse parse = parseUtil.parseByExtensionId("parse-tika", content).get(
+        content.getUrl());
+
+    String trimmedText = parse.getText().trim();
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        trimmedText);
+
+    String docTitle = parse.getData().getTitle();
+    Metadata parseMeta = parse.getData().getParseMeta();
+    Assert.assertEquals("test rft document", docTitle);
+    Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..4224f93
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * Some sample tags:
+   * <meta name="robots" content="index,follow">
+   * <meta name="robots" content="noindex,follow">
+   * <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  // HTML pages under test; answers and currURLsAndAnswers follow this order.
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  // Expected { noIndex, noFollow, noCache } flags, one row per entry of tests.
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  // Per page: { URL handed to the processor, expected base href or null }.
+  private URL[][] currURLsAndAnswers;
+
+  /**
+   * Parses each page in {@link #tests} and checks the noindex, nofollow and
+   * nocache flags plus the base href extracted by HTMLMetaProcessor.
+   */
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.fail("couldn't make test URLs!");
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        // An unparsable page must fail here, not be checked as an empty DOM.
+        Assert.fail("couldn't parse test " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      // assertEquals is null-safe and reports both values on failure.
+      URL expectedBase = currURLsAndAnswers[i][1];
+      Assert.assertEquals("got base href wrong on test " + i, expectedBase,
+          robotsMeta.getBaseHref());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
deleted file mode 100644
index 96029a6..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ /dev/null
@@ -1,337 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.tika.DOMContentUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils {
-
-  private static final String[] testPages = {
-
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"http://www.nutch.org\">"
-          + " anchor </a><!--comment-->" + "</body></html>"),
-
-      new String("<html><head><title> title </title><script> script </script>"
-          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
-          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
-          + "</body></html>"),
-
-      new String("<html><head><title> </title>" + "</head><body> "
-          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
-          + "</a></a>" + "</body></html>"),
-
-      // this one relies on certain neko fixup behavior, possibly
-      // distributing the anchors into the LI's-but not the other
-      // anchors (outside of them, instead)! So you get a tree that
-      // looks like:
-      // ... <li> <a href=/> home </a> </li>
-      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-      new String("<html><head><title> my title </title>"
-          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
-          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
-          + "</body></html>"),
-
-      // test frameset link extraction. The invalid frame in the middle
-      // will be
-      // fixed to a third standalone frame.
-      new String("<html><head><title> my title </title>"
-          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
-          + "</frame>" + "<frameset cols=\"20,*\">"
-          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
-          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
-          + "</frameset>" + "</frameset>" + "</body></html>"),
-
-      // test <area> and <iframe> link extraction + url normalization
-      new String(
-          "<html><head><title> my title </title>"
-              + "</head><body>"
-              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-              + "<map name=\"green\">"
-              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
-              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
-              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
-
-      // test whitespace processing for plain text extraction
-      new String(
-          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
-              + " </head>\n"
-              + " <body>\n"
-              + "    <h1> Whitespace\ttest  </h1> \n"
-              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
-              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
-              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-              + "<table>"
-              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
-              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-              + "</table>put some text here<Br>and there."
-              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-              + "         .        .        .         ." + "</body>  </html>"),
-
-      // test that <a rel=nofollow> links are not returned
-      new String("<html><head></head><body>"
-          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-          + "</body></html>"),
-      // test that POST form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      // test that all form actions are skipped
-      new String("<html><head></head><body>"
-          + "<form method='POST' action='/search.jsp'><input type=text>"
-          + "<input type=submit><p>test1</p></form>"
-          + "<form method='GET' action='/dummy.jsp'><input type=text>"
-          + "<input type=submit><p>test2</p></form></body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
-          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
-      new String("<html><head><title> title </title>" + "</head><body>"
-          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
-          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
-          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
-
-  private static int SKIP = 9;
-
-  private static String[] testBaseHrefs = { "http://www.nutch.org",
-      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
-      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
-      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-      "http://www.nutch.org//", "http://www.nutch.org/",
-      "http://www.nutch.org/", "http://www.nutch.org/",
-      "http://www.nutch.org/;something" };
-
-  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
-  private static final String[] answerText = {
-      "title body anchor",
-      "title body home bots",
-      "separate this from this",
-      "my title body home 1 2",
-      "my title",
-      "my title the bottom",
-      "my title Whitespace test whitespace test "
-          + "This is a whitespace test . Newlines should appear as space too. "
-          + "Tabs are spaces too. This is a break -> and the line after break . "
-          + "one two three space here space there no space "
-          + "one two two three three four put some text here and there. "
-          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
-      "test1 test2", "title anchor1 anchor2 anchor3",
-      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
-
-  private static final String[] answerTitle = { "title", "title", "",
-      "my title", "my title", "my title", "my title", "", "", "", "title",
-      "title" };
-
-  // note: should be in page-order
-  private static Outlink[][] answerOutlinks;
-
-  private static Configuration conf;
-  private static DOMContentUtils utils = null;
-
-  @Before
-  public void setup() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.setBoolean("parser.html.form.use_action", true);
-    utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser = new DOMFragmentParser();
-    parser.setFeature(
-        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
-        true);
-    for (int i = 0; i < testPages.length; i++) {
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
-      try {
-        parser.parse(
-            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
-            node);
-        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
-      } catch (Exception e) {
-        Assert.assertTrue("caught exception: " + e, false);
-      }
-      testDOMs[i] = node;
-    }
-    answerOutlinks = new Outlink[][] {
-        { new Outlink("http://www.nutch.org", "anchor"), },
-        { new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
-        { new Outlink("http://www.nutch.org/", "separate this"),
-            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
-        { new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/1", "1"),
-            new Outlink("http://www.nutch.org/docs/2", "2"), },
-        { new Outlink("http://www.nutch.org/frames/top.html", ""),
-            new Outlink("http://www.nutch.org/frames/left.html", ""),
-            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-            new Outlink("http://www.nutch.org/frames/right.html", ""), },
-        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-            new Outlink("http://www.nutch.org/index.html", ""),
-            new Outlink("http://www.nutch.org/maps/#bottom", ""),
-            new Outlink("http://www.nutch.org/bot.html", ""),
-            new Outlink("http://www.nutch.org/docs/index.html", ""), },
-        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-        {},
-        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
-        {},
-        { new Outlink("http://www.nutch.org/;x", "anchor1"),
-            new Outlink("http://www.nutch.org/g;x", "anchor2"),
-            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
-        {
-            // this is tricky - see RFC3986 section 5.4.1 example 7
-            new Outlink("http://www.nutch.org/g", "anchor1"),
-            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
-            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
-                "anchor5") } };
-
-  }
-
-  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1 = new StringTokenizer(s1);
-    StringTokenizer st2 = new StringTokenizer(s2);
-
-    while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens())
-        return false;
-      if (!st1.nextToken().equals(st2.nextToken()))
-        return false;
-    }
-    if (st2.hasMoreTokens())
-      return false;
-    return true;
-  }
-
-  @Test
-  public void testGetText() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getText(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerText[i], text));
-    }
-  }
-
-  @Test
-  public void testGetTitle() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      StringBuffer sb = new StringBuffer();
-      utils.getTitle(sb, testDOMs[i]);
-      String text = sb.toString();
-      Assert.assertTrue(
-          "expecting text: " + answerText[i]
-              + System.getProperty("line.separator")
-              + System.getProperty("line.separator") + "got text: " + text,
-          equalsIgnoreWhitespace(answerTitle[i], text));
-    }
-  }
-
-  @Test
-  public void testGetOutlinks() throws Exception {
-    if (testDOMs[0] == null)
-      setup();
-    for (int i = 0; i < testPages.length; i++) {
-      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-      if (i == SKIP) {
-        conf.setBoolean("parser.html.form.use_action", false);
-        utils.setConf(conf);
-      } else {
-        conf.setBoolean("parser.html.form.use_action", true);
-        utils.setConf(conf);
-      }
-      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr = new Outlink[outlinks.size()];
-      outlinkArr = outlinks.toArray(outlinkArr);
-      compareOutlinks(answerOutlinks[i], outlinkArr);
-    }
-  }
-
-  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i = 0; i < o.length; i++) {
-      sb.append(o[i].toString());
-      sb.append(System.getProperty("line.separator"));
-    }
-  }
-
-  private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb = new StringBuffer();
-    appendOutlinks(sb, o);
-    return sb.toString();
-  }
-
-  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
-    if (o1.length != o2.length) {
-      Assert.assertTrue(
-          "got wrong number of outlinks (expecting " + o1.length + ", got "
-              + o2.length + ")" + System.getProperty("line.separator")
-              + "answer: " + System.getProperty("line.separator")
-              + outlinksString(o1) + System.getProperty("line.separator")
-              + "got: " + System.getProperty("line.separator")
-              + outlinksString(o2) + System.getProperty("line.separator"),
-          false);
-    }
-
-    for (int i = 0; i < o1.length; i++) {
-      if (!o1[i].equals(o2[i])) {
-        Assert.assertTrue(
-            "got wrong outlinks at position " + i
-                + System.getProperty("line.separator") + "answer: "
-                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
-                + "', anchor: '" + o1[i].getAnchor() + "'"
-                + System.getProperty("line.separator") + "got: "
-                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
-                + "', anchor: '" + o2[i].getAnchor() + "'", false);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
deleted file mode 100644
index c9394dc..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * 
- * @author mattmann / jnioche
- * 
- *         Test Suite for the RSS feeds with the {@link TikaParser}.
- * 
- */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  private String[] sampleFiles = { "rsstest.rss" };
-
-  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
-      .getName());
-
-  /**
-   * <p>
-   * The test method: tests out the following 2 asserts:
-   * </p>
-   * 
-   * <ul>
-   * <li>There are 3 outlinks read from the sample rss file</li>
-   * <li>The 3 outlinks read are in fact the correct outlinks from the sample
-   * file</li>
-   * </ul>
-   */
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    Configuration conf = NutchConfiguration.create();
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      // check that there are 2 outlinks:
-      // unlike the original parse-rss
-      // tika ignores the URL and description of the channel
-
-      // http://test.channel.com
-      // http://www-scf.usc.edu/~mattmann/
-      // http://www.nutch.org
-
-      ParseData theParseData = parse.getData();
-
-      Outlink[] theOutlinks = theParseData.getOutlinks();
-
-      Assert.assertTrue("There aren't 2 outlinks read!",
-          theOutlinks.length == 2);
-
-      // now check to make sure that those are the two outlinks
-      boolean hasLink1 = false, hasLink2 = false;
-
-      for (int j = 0; j < theOutlinks.length; j++) {
-        if (theOutlinks[j].getToUrl().equals(
-            "http://www-scf.usc.edu/~mattmann/")) {
-          hasLink1 = true;
-        }
-
-        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
-          hasLink2 = true;
-        }
-      }
-
-      if (!hasLink1 || !hasLink2) {
-        Assert.fail("Outlinks read from sample rss file are not correct!");
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
deleted file mode 100644
index b1762e6..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Test extraction of image metadata
- */
-public class TestImageMetadata {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  private String[] sampleFiles = { "nutch_logo_tm.gif", };
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      Assert.assertEquals("121", parse.getData().getMeta("width"));
-      Assert.assertEquals("48", parse.getData().getMeta("height"));
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
deleted file mode 100644
index 576b3df..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-
-/**
- * Unit tests for MSWordParser.
- * 
- * @author John Xing
- */
-public class TestMSWordParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-msword/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
-  private String[] sampleFiles = { "word97.doc" };
-
-  private String expectedText = "This is a sample doc file prepared for nutch.";
-
-  private Configuration conf;
-
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
-  }
-
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    for (int i = 0; i < sampleFiles.length; i++) {
-      String found = getTextContent(sampleFiles[i]);
-      Assert.assertTrue("text found : '" + found + "'",
-          found.startsWith(expectedText));
-    }
-  }
-
-  @Test
-  public void testOpeningDocs() throws ProtocolException, ParseException {
-    String[] filenames = new File(sampleDir).list();
-    for (int i = 0; i < filenames.length; i++) {
-      if (filenames[i].endsWith(".doc") == false)
-        continue;
-      Assert.assertTrue("cann't read content of " + filenames[i],
-          getTextContent(filenames[i]).length() > 0);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
deleted file mode 100644
index 6960bad..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for OOParser.
- * 
- * @author Andrzej Bialecki
- */
-public class TestOOParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-oo/build.xml during plugin compilation.
-  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
-
-  private String expectedText;
-
-  private String sampleText = "ootest.txt";
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-    Protocol protocol;
-    ProtocolFactory factory = new ProtocolFactory(conf);
-
-    System.out.println("Expected : " + expectedText);
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      if (sampleFiles[i].startsWith("ootest") == false)
-        continue;
-
-      protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
-
-      // simply test for the presence of a text - the ordering of the elements
-      // may differ from what was expected
-      // in the previous tests
-      Assert.assertTrue(text != null && text.length() > 0);
-
-      System.out.println("Found " + sampleFiles[i] + ": " + text);
-    }
-  }
-
-  public TestOOParser() {
-    try {
-      // read the test string
-      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-          + sampleText);
-      StringBuffer sb = new StringBuffer();
-      int len = 0;
-      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
-      char[] buf = new char[1024];
-      while ((len = isr.read(buf)) > 0) {
-        sb.append(buf, 0, len);
-      }
-      isr.close();
-      expectedText = sb.toString();
-      // normalize space
-      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
deleted file mode 100644
index 9884f0c..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for PdfParser.
- * 
- * @author John Xing
- */
-public class TestPdfParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
-  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
-
-  private String expectedText = "A VERY SMALL PDF FILE";
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      int index = parse.getText().indexOf(expectedText);
-      Assert.assertTrue(index > 0);
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
deleted file mode 100644
index f15d821..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.tika;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.junit.Assert;
-import org.junit.Ignore;
-import org.junit.Test;
-
-/**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- * 
- * @author Andy Hedges
- */
-public class TestRTFParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
-  private String rtfFile = "test.rtf";
-
-  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
-    Configuration conf = NutchConfiguration.create();
-    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
-    protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
-        .getContent();
-    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
-        content.getUrl());
-    String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
-        text.trim());
-
-    String title = parse.getData().getTitle();
-    Metadata meta = parse.getData().getParseMeta();
-
-    Assert.assertEquals("test rft document", title);
-    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
-
-  }
-}

[55/69] [abbrv] nutch git commit: Moved test sources to maven standard directory

Reply via email to