http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java new file mode 100644 index 0000000..96029a6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java @@ -0,0 +1,337 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.tika.DOMContentUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * Unit tests for DOMContentUtils. 
+ */ +public class TestDOMContentUtils { + + private static final String[] testPages = { + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. 
+ new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." 
+ "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + + private static int SKIP = 9; + + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new 
URL[testPages.length]; + + private static final String[] answerText = { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; + + // note: should be in page-order + private static Outlink[][] answerOutlinks; + + private static Configuration conf; + private static DOMContentUtils utils = null; + + @Before + public void setup() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean("parser.html.form.use_action", true); + utils = new DOMContentUtils(conf); + DOMFragmentParser parser = new DOMFragmentParser(); + parser.setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + for (int i = 0; i < testPages.length; i++) { + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + try { + parser.parse( + new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), + node); + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); + } catch (Exception e) { + Assert.assertTrue("caught exception: " + e, false); + } + testDOMs[i] = node; + } + answerOutlinks = new Outlink[][] { + { new Outlink("http://www.nutch.org", "anchor"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", 
"separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; + + } + + private static boolean equalsIgnoreWhitespace(String s1, String s2) { + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); + + while (st1.hasMoreTokens()) { + if (!st2.hasMoreTokens()) + return false; + if (!st1.nextToken().equals(st2.nextToken())) + return false; + } + if (st2.hasMoreTokens()) + return false; + return true; + } + + @Test + public void testGetText() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { 
+ StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerText[i], text)); + } + } + + /** + * Checks that the title extracted from each test page equals the matching + * answerTitle entry, ignoring whitespace differences. The failure message + * reports answerTitle (the value actually compared), not answerText. + */ + @Test + public void testGetTitle() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getTitle(sb, testDOMs[i]); + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerTitle[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerTitle[i], text)); + } + } + + @Test + public void testGetOutlinks() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + if (i == SKIP) { + conf.setBoolean("parser.html.form.use_action", false); + utils.setConf(conf); + } else { + conf.setBoolean("parser.html.form.use_action", true); + utils.setConf(conf); + } + utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = outlinks.toArray(outlinkArr); + compareOutlinks(answerOutlinks[i], outlinkArr); + } + } + + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { + for (int i = 0; i < o.length; i++) { + sb.append(o[i].toString()); + sb.append(System.getProperty("line.separator")); + } + } + + private static final String outlinksString(Outlink[] o) { + StringBuffer sb = new StringBuffer(); + appendOutlinks(sb, o); + return sb.toString(); + } + + private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { + if (o1.length != o2.length) { + Assert.assertTrue( + "got wrong number of outlinks (expecting " + o1.length + ", got " + + o2.length + ")" +
System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + outlinksString(o1) + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + System.getProperty("line.separator"), + false); + } + + for (int i = 0; i < o1.length; i++) { + if (!o1[i].equals(o2[i])) { + Assert.assertTrue( + "got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); + } + } + } +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java new file mode 100644 index 0000000..c9394dc --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.tika.TikaParser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +/** + * + * @author mattmann / jnioche + * + * Test Suite for the RSS feeds with the {@link TikaParser}. + * + */ +public class TestFeedParser { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + private String[] sampleFiles = { "rsstest.rss" }; + + public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class + .getName()); + + /** + * <p> + * The test method: tests out the following 2 asserts: + * </p> + * + * <ul> + * <li>There are 2 outlinks read from the sample rss file</li> + * <li>The 2 outlinks read are in fact the correct outlinks from the sample + * file</li> + * </ul> + */ + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new
CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + // check that there are 2 outlinks: + // unlike the original parse-rss + // tika ignores the URL and description of the channel + + // http://test.channel.com + // http://www-scf.usc.edu/~mattmann/ + // http://www.nutch.org + + ParseData theParseData = parse.getData(); + + Outlink[] theOutlinks = theParseData.getOutlinks(); + + Assert.assertTrue("There aren't 2 outlinks read!", + theOutlinks.length == 2); + + // now check to make sure that those are the two outlinks + boolean hasLink1 = false, hasLink2 = false; + + for (int j = 0; j < theOutlinks.length; j++) { + if (theOutlinks[j].getToUrl().equals( + "http://www-scf.usc.edu/~mattmann/")) { + hasLink1 = true; + } + + if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { + hasLink2 = true; + } + } + + if (!hasLink1 || !hasLink2) { + Assert.fail("Outlinks read from sample rss file are not correct!"); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java new file mode 100644 index 0000000..b1762e6 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Test extraction of image metadata + */ +public class TestImageMetadata { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + private String[] sampleFiles = { "nutch_logo_tm.gif", }; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + 
.get(content.getUrl()); + + Assert.assertEquals("121", parse.getData().getMeta("width")); + Assert.assertEquals("48", parse.getData().getMeta("height")); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java new file mode 100644 index 0000000..576b3df --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; + +/** + * Unit tests for MSWordParser. + * + * @author John Xing + */ +public class TestMSWordParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-msword/build.xml during plugin compilation. + // Check ./src/plugin/parse-msword/sample/README.txt for what they are. 
+ private String[] sampleFiles = { "word97.doc" }; + + private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + public String getTextContent(String fileName) throws ProtocolException, + ParseException { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + return parse.getText(); + } + + @Test + public void testIt() throws ProtocolException, ParseException { + for (int i = 0; i < sampleFiles.length; i++) { + String found = getTextContent(sampleFiles[i]); + Assert.assertTrue("text found : '" + found + "'", + found.startsWith(expectedText)); + } + } + + /** + * Parses every ".doc" file found in the sample directory and checks that + * non-empty text is extracted from each one. Fails with an explicit message + * when the sample directory cannot be listed. + */ + @Test + public void testOpeningDocs() throws ProtocolException, ParseException { + String[] filenames = new File(sampleDir).list(); + /* File.list() returns null when sampleDir is not a listable directory */ + Assert.assertNotNull("cannot list sample directory " + sampleDir, filenames); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".doc") == false) + continue; + Assert.assertTrue("cann't read content of " + filenames[i], + getTextContent(filenames[i]).length() > 0); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java new file mode 100644 index 0000000..6960bad --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.*; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for OOParser. + * + * @author Andrzej Bialecki + */ +public class TestOOParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-oo/build.xml during plugin compilation. 
+ private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; + + private String expectedText; + + private String sampleText = "ootest.txt"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + Protocol protocol; + ProtocolFactory factory = new ProtocolFactory(conf); + + System.out.println("Expected : " + expectedText); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + if (sampleFiles[i].startsWith("ootest") == false) + continue; + + protocol = factory.getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + + // simply test for the presence of a text - the ordering of the elements + // may differ from what was expected + // in the previous tests + Assert.assertTrue(text != null && text.length() > 0); + + System.out.println("Found " + sampleFiles[i] + ": " + text); + } + } + + public TestOOParser() { + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleText); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + } + isr.close(); + expectedText = sb.toString(); + // normalize space + expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); + } catch (Exception e) { + e.printStackTrace(); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java ---------------------------------------------------------------------- diff 
--git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java new file mode 100644 index 0000000..9884f0c --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for PdfParser. 
+ * + * @author John Xing + */ +public class TestPdfParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-pdf/build.xml during plugin compilation. + // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. + private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; + + private String expectedText = "A VERY SMALL PDF FILE"; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + + int index = parse.getText().indexOf(expectedText); + Assert.assertTrue(index > 0); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java new file mode 100644 index 0000000..f15d821 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.tika; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). + * + * @author Andy Hedges + */ +public class TestRTFParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-rtf/build.xml during plugin compilation. + // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. 
+ private String rtfFile = "test.rtf"; + + @Ignore("There seems to be an issue with line 71 e.g. text.trim()") + @Test + public void testIt() throws ProtocolException, ParseException { + + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + urlString = "file:" + sampleDir + fileSeparator + rtfFile; + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( + content.getUrl()); + String text = parse.getText(); + Assert.assertEquals("The quick brown fox jumps over the lazy dog", + text.trim()); + + String title = parse.getData().getTitle(); + Metadata meta = parse.getData().getParseMeta(); + + Assert.assertEquals("test rft document", title); + Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT)); + + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java new file mode 100644 index 0000000..4224f93 --- /dev/null +++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.tika.HTMLMetaProcessor; + +import java.io.ByteArrayInputStream; +import java.net.URL; + +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests for HTMLMetaProcessor. */ +public class TestRobotsMetaProcessor { + + /* + * + * some sample tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> + */ + + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test 
page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", + + }; + + public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! 
+ }; + + private URL[][] currURLsAndAnswers; + + @Test + public void testRobotsMetaProcessor() { + DOMFragmentParser parser = new DOMFragmentParser(); + ; + + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; + } catch (Exception e) { + Assert.assertTrue("couldn't make test URLs!", false); + } + + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); + + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + + try { + parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); + } catch (Exception e) { + e.printStackTrace(); + } + + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); + + Assert.assertTrue("got index wrong on test " + i, + robotsMeta.getNoIndex() == answers[i][0]); + Assert.assertTrue("got follow wrong on test " + i, + robotsMeta.getNoFollow() == answers[i][1]); + Assert.assertTrue("got cache wrong on test " + i, + robotsMeta.getNoCache() == answers[i][2]); + Assert + .assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); + + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java 
---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java deleted file mode 100644 index 96029a6..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java +++ /dev/null @@ -1,337 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.tika.DOMContentUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.ByteArrayInputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; - -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; -import org.cyberneko.html.parsers.DOMFragmentParser; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * Unit tests for DOMContentUtils. 
- */ -public class TestDOMContentUtils { - - private static final String[] testPages = { - - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" + "</body></html>"), - - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" - + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" - + "</body></html>"), - - new String("<html><head><title> </title>" + "</head><body> " - + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" - + "</a></a>" + "</body></html>"), - - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" - + "</body></html>"), - - // test frameset link extraction. The invalid frame in the middle - // will be - // fixed to a third standalone frame. 
- new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" - + "</frame>" + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" - + "</frame>" + "<frame src=\"right.html\">" + "</frame>" - + "</frameset>" + "</frameset>" + "</body></html>"), - - // test <area> and <iframe> link extraction + url normalization - new String( - "<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), - - // test whitespace processing for plain text extraction - new String( - "<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." 
+ "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; - - private static int SKIP = 9; - - private static String[] testBaseHrefs = { "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", "http://www.nutch.org/", - "http://www.nutch.org/", "http://www.nutch.org/", - "http://www.nutch.org/;something" }; - - private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs = new 
URL[testPages.length]; - - private static final String[] answerText = { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", "ignore ignore", "test1 test2", - "test1 test2", "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5" }; - - private static final String[] answerTitle = { "title", "title", "", - "my title", "my title", "my title", "my title", "", "", "", "title", - "title" }; - - // note: should be in page-order - private static Outlink[][] answerOutlinks; - - private static Configuration conf; - private static DOMContentUtils utils = null; - - @Before - public void setup() throws Exception { - conf = NutchConfiguration.create(); - conf.setBoolean("parser.html.form.use_action", true); - utils = new DOMContentUtils(conf); - DOMFragmentParser parser = new DOMFragmentParser(); - parser.setFeature( - "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", - true); - for (int i = 0; i < testPages.length; i++) { - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - try { - parser.parse( - new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), - node); - testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); - } catch (Exception e) { - Assert.assertTrue("caught exception: " + e, false); - } - testDOMs[i] = node; - } - answerOutlinks = new Outlink[][] { - { new Outlink("http://www.nutch.org", "anchor"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, - { new Outlink("http://www.nutch.org/", 
"separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), }, - { new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), }, - { new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), }, - { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, - {}, - { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, - {}, - { new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", - "anchor5") } }; - - } - - private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1 = new StringTokenizer(s1); - StringTokenizer st2 = new StringTokenizer(s2); - - while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) - return false; - if (!st1.nextToken().equals(st2.nextToken())) - return false; - } - if (st2.hasMoreTokens()) - return false; - return true; - } - - @Test - public void testGetText() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { 
- StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerText[i], text)); - } - } - - @Test - public void testGetTitle() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getTitle(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerTitle[i], text)); - } - } - - @Test - public void testGetOutlinks() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - if (i == SKIP) { - conf.setBoolean("parser.html.form.use_action", false); - utils.setConf(conf); - } else { - conf.setBoolean("parser.html.form.use_action", true); - utils.setConf(conf); - } - utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr = new Outlink[outlinks.size()]; - outlinkArr = outlinks.toArray(outlinkArr); - compareOutlinks(answerOutlinks[i], outlinkArr); - } - } - - private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i = 0; i < o.length; i++) { - sb.append(o[i].toString()); - sb.append(System.getProperty("line.separator")); - } - } - - private static final String outlinksString(Outlink[] o) { - StringBuffer sb = new StringBuffer(); - appendOutlinks(sb, o); - return sb.toString(); - } - - private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { - if (o1.length != o2.length) { - Assert.assertTrue( - "got wrong number of outlinks (expecting " + o1.length + ", got " - + o2.length + ")" + 
System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) + System.getProperty("line.separator"), - false); - } - - for (int i = 0; i < o1.length; i++) { - if (!o1[i].equals(o2[i])) { - Assert.assertTrue( - "got wrong outlinks at position " + i - + System.getProperty("line.separator") + "answer: " - + System.getProperty("line.separator") + "'" + o1[i].getToUrl() - + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") + "got: " - + System.getProperty("line.separator") + "'" + o2[i].getToUrl() - + "', anchor: '" + o2[i].getAnchor() + "'", false); - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java deleted file mode 100644 index c9394dc..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.tika.TikaParser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; - -/** - * - * @author mattmann / jnioche - * - * Test Suite for the RSS feeds with the {@link TikaParser}. 
- * - */ -public class TestFeedParser { - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - private String[] sampleFiles = { "rsstest.rss" }; - - public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class - .getName()); - - /** - * <p> - * The test method: tests out the following 2 asserts: - * </p> - * - * <ul> - * <li>There are 3 outlinks read from the sample rss file</li> - * <li>The 3 outlinks read are in fact the correct outlinks from the sample - * file</li> - * </ul> - */ - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - // check that there are 2 outlinks: - // unlike the original parse-rss - // tika ignores the URL and description of the channel - - // http://test.channel.com - // http://www-scf.usc.edu/~mattmann/ - // http://www.nutch.org - - ParseData theParseData = parse.getData(); - - Outlink[] theOutlinks = theParseData.getOutlinks(); - - Assert.assertTrue("There aren't 2 outlinks read!", - theOutlinks.length == 2); - - // now check to make sure that those are the two outlinks - boolean hasLink1 = false, hasLink2 = false; - - for (int j = 0; j < theOutlinks.length; j++) { - if (theOutlinks[j].getToUrl().equals( - "http://www-scf.usc.edu/~mattmann/")) { - hasLink1 = true; - } - - if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) { - 
hasLink2 = true; - } - } - - if (!hasLink1 || !hasLink2) { - Assert.fail("Outlinks read from sample rss file are not correct!"); - } - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java deleted file mode 100644 index b1762e6..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Test extraction of image metadata - */ -public class TestImageMetadata { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - private String[] sampleFiles = { "nutch_logo_tm.gif", }; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - Configuration conf = NutchConfiguration.create(); - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - Assert.assertEquals("121", parse.getData().getMeta("width")); - Assert.assertEquals("48", parse.getData().getMeta("height")); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java deleted file mode 100644 index 576b3df..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.File; - -/** - * Unit tests for MSWordParser. 
- * - * @author John Xing - */ -public class TestMSWordParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-msword/build.xml during plugin compilation. - // Check ./src/plugin/parse-msword/sample/README.txt for what they are. - private String[] sampleFiles = { "word97.doc" }; - - private String expectedText = "This is a sample doc file prepared for nutch."; - - private Configuration conf; - - @Before - public void setUp() { - conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - } - - public String getTextContent(String fileName) throws ProtocolException, - ParseException { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - return parse.getText(); - } - - @Test - public void testIt() throws ProtocolException, ParseException { - for (int i = 0; i < sampleFiles.length; i++) { - String found = getTextContent(sampleFiles[i]); - Assert.assertTrue("text found : '" + found + "'", - found.startsWith(expectedText)); - } - } - - @Test - public void testOpeningDocs() throws ProtocolException, ParseException { - String[] filenames = new File(sampleDir).list(); - for (int i = 0; i < filenames.length; i++) { - if (filenames[i].endsWith(".doc") == false) - continue; - Assert.assertTrue("cann't read content of " + filenames[i], - getTextContent(filenames[i]).length() > 0); - } - } -} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java deleted file mode 100644 index 6960bad..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.tika; - -import java.io.FileInputStream; -import java.io.InputStreamReader; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.protocol.*; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for OOParser. 
- * - * @author Andrzej Bialecki - */ -public class TestOOParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-oo/build.xml during plugin compilation. - private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; - - private String expectedText; - - private String sampleText = "ootest.txt"; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Content content; - Parse parse; - Configuration conf = NutchConfiguration.create(); - Protocol protocol; - ProtocolFactory factory = new ProtocolFactory(conf); - - System.out.println("Expected : " + expectedText); - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - if (sampleFiles[i].startsWith("ootest") == false) - continue; - - protocol = factory.getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); - - // simply test for the presence of a text - the ordering of the elements - // may differ from what was expected - // in the previous tests - Assert.assertTrue(text != null && text.length() > 0); - - System.out.println("Found " + sampleFiles[i] + ": " + text); - } - } - - public TestOOParser() { - try { - // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator - + sampleText); - StringBuffer sb = new StringBuffer(); - int len = 0; - InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); - char[] buf = new char[1024]; - while ((len = isr.read(buf)) > 0) { - sb.append(buf, 0, len); - 
} - isr.close(); - expectedText = sb.toString(); - // normalize space - expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); - } catch (Exception e) { - e.printStackTrace(); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java deleted file mode 100644 index 9884f0c..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.tika; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for PdfParser. - * - * @author John Xing - */ -public class TestPdfParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-pdf/build.xml during plugin compilation. - // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. 
- private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; - - private String expectedText = "A VERY SMALL PDF FILE"; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - Configuration conf = NutchConfiguration.create(); - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - int index = parse.getText().indexOf(expectedText); - Assert.assertTrue(index > 0); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java deleted file mode 100644 index f15d821..0000000 --- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.tika; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.DublinCore; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; - -/** - * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). - * - * @author Andy Hedges - */ -public class TestRTFParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-rtf/build.xml during plugin compilation. - // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. - private String rtfFile = "test.rtf"; - - @Ignore("There seems to be an issue with line 71 e.g. 
text.trim()") - @Test - public void testIt() throws ProtocolException, ParseException { - - String urlString; - Protocol protocol; - Content content; - Parse parse; - - Configuration conf = NutchConfiguration.create(); - urlString = "file:" + sampleDir + fileSeparator + rtfFile; - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) - .getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( - content.getUrl()); - String text = parse.getText(); - Assert.assertEquals("The quick brown fox jumps over the lazy dog", - text.trim()); - - String title = parse.getData().getTitle(); - Metadata meta = parse.getData().getParseMeta(); - - Assert.assertEquals("test rft document", title); - Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT)); - - } -}
