// Provenance: nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
// (file added in Nutch revision 20d28406).
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import org.apache.nutch.parse.Outlink;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.cyberneko.html.parsers.*;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/**
 * Unit tests for DOMContentUtils.
 *
 * <p>
 * The fixtures are parallel arrays: index {@code i} of {@code testPages},
 * {@code testBaseHrefs}, {@code answerText}, {@code answerTitle} and
 * {@code answerOutlinks} all describe the same test page. Every page is parsed
 * into a DOM fragment by {@link #setup()} before each test.
 */
public class TestDOMContentUtils {

  /** Explicit encoding for the ASCII-only test pages, so the bytes do not depend on the platform default. */
  private static final Charset UTF_8 = Charset.forName("UTF-8");

  /** Platform line separator used when formatting failure messages. */
  private static final String NL = System.getProperty("line.separator");

  private static final String[] testPages = {
      "<html><head><title> title </title><script> script </script>"
          + "</head><body> body <a href=\"http://www.nutch.org\">"
          + " anchor </a><!--comment-->" + "</body></html>",
      "<html><head><title> title </title><script> script </script>"
          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
          + "</body></html>",
      "<html><head><title> </title>" + "</head><body> "
          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
          + "</a></a>" + "</body></html>",
      // this one relies on certain neko fixup behavior, possibly
      // distributing the anchors into the LI's-but not the other
      // anchors (outside of them, instead)! So you get a tree that
      // looks like:
      // ... <li> <a href=/> home </a> </li>
      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
      "<html><head><title> my title </title>"
          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
          + "</body></html>",
      // test frameset link extraction. The invalid frame in the middle will be
      // fixed to a third standalone frame.
      "<html><head><title> my title </title>"
          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
          + "</frame>" + "<frameset cols=\"20,*\">"
          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
          + "</frameset>" + "</frameset>" + "</body></html>",
      // test <area> and <iframe> link extraction + url normalization
      "<html><head><title> my title </title>"
          + "</head><body>"
          + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
          + "<map name=\"green\">"
          + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
          + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
          + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
          + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
          + "<iframe src=\"../docs/index.html\"/>" + "</body></html>",
      // test whitespace processing for plain text extraction
      "<html><head>\n <title> my\t\n title\r\n </title>\n"
          + " </head>\n"
          + " <body>\n"
          + " <h1> Whitespace\ttest </h1> \n"
          + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
          + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
          + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
          + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
          + "<table>"
          + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
          + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
          + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
          + "</table>put some text here<Br>and there."
          + "<h2>End\tthis\rmadness\n!</h2>\r\n"
          + " . . . ." + "</body> </html>",

      // test that <a rel=nofollow> links are not returned
      "<html><head></head><body>"
          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
          + "</body></html>",
      // test that POST form actions are skipped
      "<html><head></head><body>"
          + "<form method='POST' action='/search.jsp'><input type=text>"
          + "<input type=submit><p>test1</p></form>"
          + "<form method='GET' action='/dummy.jsp'><input type=text>"
          + "<input type=submit><p>test2</p></form></body></html>",
      // test that all form actions are skipped
      "<html><head></head><body>"
          + "<form method='POST' action='/search.jsp'><input type=text>"
          + "<input type=submit><p>test1</p></form>"
          + "<form method='GET' action='/dummy.jsp'><input type=text>"
          + "<input type=submit><p>test2</p></form></body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\"g\"><!--no anchor--></a>"
          + "<a href=\"g1\"> <!--whitespace--> </a>"
          + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
          + "</body></html>", };

  /**
   * Index of the test page for which {@code parser.html.form.use_action} is
   * switched off in {@link #testGetOutlinks()}.
   */
  private static final int SKIP = 9;

  private static String[] testBaseHrefs = { "http://www.nutch.org",
      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
      "http://www.nutch.org//", "http://www.nutch.org/",
      "http://www.nutch.org/", "http://www.nutch.org/",
      "http://www.nutch.org/;something", "http://www.nutch.org/" };

  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];

  private static URL[] testBaseHrefURLs = new URL[testPages.length];

  private static final String[] answerText = {
      "title body anchor",
      "title body home bots",
      "separate this from this",
      "my title body home 1 2",
      "my title",
      "my title the bottom",
      "my title Whitespace test whitespace test "
          + "This is a whitespace test . Newlines should appear as space too. "
          + "Tabs are spaces too. This is a break -> and the line after break . "
          + "one two three space here space there no space "
          + "one two two three three four put some text here and there. "
          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
      "test1 test2", "title anchor1 anchor2 anchor3",
      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };

  private static final String[] answerTitle = { "title", "title", "",
      "my title", "my title", "my title", "my title", "", "", "", "title",
      "title", "title" };

  // note: should be in page-order
  private static Outlink[][] answerOutlinks;

  private static Configuration conf;
  private static DOMContentUtils utils = null;

  /**
   * Rebuilds the shared fixtures before every test: a fresh configuration with
   * form-action extraction enabled, the DOM fragment and base URL of every
   * test page, and the table of expected outlinks. Any failure to build a
   * fixture fails the test immediately.
   */
  @Before
  public void setup() {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);

    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser.setFeature(
          "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
          true);
    } catch (SAXException ignored) {
      // Best effort, kept from the original test: a parser that rejects this
      // feature is still used as-is.
      // NOTE(review): the <iframe .../> page presumably depends on this
      // feature - confirm whether a rejection should fail the test instead.
    }

    for (int i = 0; i < testPages.length; i++) {
      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
      try {
        parser.parse(
            new InputSource(new ByteArrayInputStream(testPages[i]
                .getBytes(UTF_8))), node);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        Assert.fail("caught exception: " + e);
      }
      testDOMs[i] = node;
    }

    try {
      answerOutlinks = new Outlink[][] {
          { new Outlink("http://www.nutch.org", "anchor"), },
          { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
          { new Outlink("http://www.nutch.org/", "separate this"),
              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
          { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/1", "1"),
              new Outlink("http://www.nutch.org/docs/2", "2"), },
          { new Outlink("http://www.nutch.org/frames/top.html", ""),
              new Outlink("http://www.nutch.org/frames/left.html", ""),
              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
              new Outlink("http://www.nutch.org/frames/right.html", ""), },
          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
              new Outlink("http://www.nutch.org/index.html", ""),
              new Outlink("http://www.nutch.org/maps/#bottom", ""),
              new Outlink("http://www.nutch.org/bot.html", ""),
              new Outlink("http://www.nutch.org/docs/index.html", ""), },
          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
          {},
          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
          {},
          { new Outlink("http://www.nutch.org/;x", "anchor1"),
              new Outlink("http://www.nutch.org/g;x", "anchor2"),
              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
          {
              // this is tricky - see RFC3986 section 5.4.1 example 7
              new Outlink("http://www.nutch.org/g", "anchor1"),
              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
                  "anchor5") },
          { new Outlink("http://www.nutch.org/g", ""),
              new Outlink("http://www.nutch.org/g1", ""),
              new Outlink("http://www.nutch.org/g2", "bla bla"),
              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
    } catch (MalformedURLException e) {
      // Previously swallowed, which left the static table null (or stale from
      // an earlier run) and surfaced later as an unrelated failure.
      Assert.fail("could not build expected outlinks: " + e);
    }
  }

  /**
   * Compares two strings token by token, treating any run of whitespace as a
   * single separator and ignoring leading and trailing whitespace.
   *
   * @return {@code true} if both strings contain the same sequence of tokens
   */
  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
    StringTokenizer st1 = new StringTokenizer(s1);
    StringTokenizer st2 = new StringTokenizer(s2);

    while (st1.hasMoreTokens()) {
      if (!st2.hasMoreTokens()) {
        return false;
      }
      if (!st1.nextToken().equals(st2.nextToken())) {
        return false;
      }
    }
    // s2 must not have tokens left over once s1 is exhausted.
    return !st2.hasMoreTokens();
  }

  /** Checks the plain text extracted from every test page, ignoring whitespace differences. */
  @Test
  public void testGetText() {
    for (int i = 0; i < testPages.length; i++) {
      StringBuffer sb = new StringBuffer();
      utils.getText(sb, testDOMs[i]);
      String text = sb.toString();
      Assert.assertTrue("expecting text: " + answerText[i] + NL + NL
          + "got text: " + text, equalsIgnoreWhitespace(answerText[i], text));
    }
  }

  /** Checks the title extracted from every test page, ignoring whitespace differences. */
  @Test
  public void testGetTitle() {
    for (int i = 0; i < testPages.length; i++) {
      StringBuffer sb = new StringBuffer();
      utils.getTitle(sb, testDOMs[i]);
      String title = sb.toString();
      // Report the expected *title*; the message used to print answerText[i].
      Assert.assertTrue("expecting title: " + answerTitle[i] + NL + NL
          + "got title: " + title,
          equalsIgnoreWhitespace(answerTitle[i], title));
    }
  }

  /**
   * Checks the outlinks extracted from every test page. Form actions are
   * extracted for all pages except page {@link #SKIP}, where
   * {@code parser.html.form.use_action} is set to {@code false}.
   */
  @Test
  public void testGetOutlinks() {
    for (int i = 0; i < testPages.length; i++) {
      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
      conf.setBoolean("parser.html.form.use_action", i != SKIP);
      utils.setConf(conf);
      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
      Outlink[] outlinkArr = outlinks.toArray(new Outlink[outlinks.size()]);
      compareOutlinks(answerOutlinks[i], outlinkArr);
    }
  }

  /** Renders the given outlinks one per line, for use in failure messages. */
  private static String outlinksString(Outlink[] o) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < o.length; i++) {
      sb.append(o[i].toString());
      sb.append(NL);
    }
    return sb.toString();
  }

  /**
   * Fails the running test unless both arrays have the same length and hold
   * equal outlinks at every position.
   *
   * @param expected
   *          the outlinks the page should produce, in page order
   * @param actual
   *          the outlinks that were extracted
   */
  private static void compareOutlinks(Outlink[] expected, Outlink[] actual) {
    if (expected.length != actual.length) {
      Assert.fail("got wrong number of outlinks (expecting " + expected.length
          + ", got " + actual.length + ")" + NL + "answer: " + NL
          + outlinksString(expected) + NL + "got: " + NL
          + outlinksString(actual) + NL);
    }

    for (int i = 0; i < expected.length; i++) {
      if (!expected[i].equals(actual[i])) {
        Assert.fail("got wrong outlinks at position " + i + NL + "answer: "
            + NL + "'" + expected[i].getToUrl() + "', anchor: '"
            + expected[i].getAnchor() + "'" + NL + "got: " + NL + "'"
            + actual[i].getToUrl() + "', anchor: '" + actual[i].getAnchor()
            + "'");
      }
    }
  }
}
// Provenance: nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
// (file added in Nutch revision 20d28406).
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.html.HtmlParser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests that HtmlParser extracts title, keywords and text correctly from
 * pages that declare (or imply) their character encoding in different ways.
 */
public class TestHtmlParser {

  public static final Logger LOG = LoggerFactory
      .getLogger(TestHtmlParser.class);

  // Language names in French, Spanish, Russian, Czech and Greek. These
  // literals are non-ASCII, so this file has to be read by the compiler in
  // the encoding it was saved in; the Russian and Czech entries had been
  // corrupted into mojibake and are restored here.
  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
  private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
  private static final String encodingTestContent = "<title>"
      + encodingTestKeywords + "</title>\n"
      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";

  /**
   * One row per test page: { description, charset used to encode the page,
   * page source }.
   */
  private static final String[][] encodingTestPages = {
      {
          "HTML4, utf-8, meta http-equiv, no quotes",
          "utf-8",
          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
              + "<html>\n<head>\n"
              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
              + encodingTestContent },
      {
          "HTML4, utf-8, meta http-equiv, single quotes",
          "utf-8",
          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
              + "<html>\n<head>\n"
              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
              + encodingTestContent },
      {
          // NOTE(review): this page opens <html> twice (once with the XHTML
          // namespace, once plain). Left unchanged - confirm whether the
          // nested tag is intentional.
          "XHTML, utf-8, meta http-equiv, double quotes",
          "utf-8",
          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
              + "<html>\n<head>\n"
              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
              + encodingTestContent },
      {
          "HTML5, utf-8, meta charset",
          "utf-8",
          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
              + encodingTestContent },
      { "HTML5, utf-8, BOM", "utf-8",
          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
      // NOTE(review): Java's "UTF-16" encoder already writes a byte-order
      // mark, so the leading \ufeff presumably yields a second one - confirm
      // that this is intended.
      { "HTML5, utf-16, BOM", "utf-16",
          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };

  private final Configuration conf;
  private final Parser parser;

  /** Creates the HtmlParser under test with a default Nutch configuration. */
  public TestHtmlParser() {
    conf = NutchConfiguration.create();
    parser = new HtmlParser();
    parser.setConf(conf);
  }

  /**
   * Parses the given bytes as a {@code text/html} document fetched from a
   * dummy URL.
   *
   * @param contentBytes
   *          the encoded page
   * @return the parse stored under the dummy URL in the parser's result
   */
  protected Parse parse(byte[] contentBytes) {
    String dummyUrl = "http://dummy.url/";
    Content content = new Content(dummyUrl, dummyUrl, contentBytes,
        "text/html", new Metadata(), conf);
    return parser.getParse(content).get(dummyUrl);
  }

  /**
   * Encodes every test page with its declared charset, parses it and checks
   * that the title, the keywords meta tag and the body text all contain the
   * expected non-ASCII keywords.
   */
  @Test
  public void testEncodingDetection() {
    for (String[] testPage : encodingTestPages) {
      String name = testPage[0];
      Charset charset = Charset.forName(testPage[1]);
      byte[] contentBytes = testPage[2].getBytes(charset);

      Parse parse = parse(contentBytes);
      String text = parse.getText();
      String title = parse.getData().getTitle();
      String keywords = parse.getData().getMeta("keywords");

      LOG.info(name);
      LOG.info("title:\t{}", title);
      LOG.info("keywords:\t{}", keywords);
      LOG.info("text:\t{}", text);

      Assert.assertEquals("Title not extracted properly (" + name + ")",
          encodingTestKeywords, title);
      for (String keyword : encodingTestKeywords.split(",\\s*")) {
        Assert.assertTrue(keyword + " not found in text (" + name + ")",
            text.contains(keyword));
      }
      Assert.assertNotNull("No keywords extracted", keywords);
      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
          encodingTestKeywords, keywords);
    }
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
// Provenance: nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
// (file added in Nutch revision 20d28406).
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import org.apache.nutch.parse.HTMLMetaTags;

import java.io.ByteArrayInputStream;
import java.net.URL;
import java.nio.charset.Charset;

import org.cyberneko.html.parsers.*;
import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/**
 * Unit tests for HTMLMetaProcessor.
 *
 * <p>
 * The fixtures are parallel: index {@code i} of {@code tests},
 * {@code answers} and {@code currURLsAndAnswers} describe the same page.
 */
public class TestRobotsMetaProcessor {

  /*
   * 
   * some sample tags:
   * 
   * <meta name="robots" content="index,follow"> <meta name="robots"
   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
   * <meta name="robots" content="noindex,nofollow">
   * 
   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
   */

  /** Explicit encoding for the ASCII-only test pages, so the bytes do not depend on the platform default. */
  private static final Charset UTF_8 = Charset.forName("UTF-8");

  public static String[] tests = {
      "<html><head><title>test page</title>"
          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"all\"> "
          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
          + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"noindex,follow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"index,nofollow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"index,follow\"> "
          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
          + " some text" + "</body></html>",

      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
          + " some text" + "</body></html>",

  };

  /** Expected { noIndex, noFollow, noCache } flags for each page in {@link #tests}. */
  public static final boolean[][] answers = { { true, true, true }, // NONE
      { false, false, true }, // all
      { true, true, true }, // nOnE
      { true, true, false }, // none
      { true, true, false }, // noindex,nofollow
      { true, false, false }, // noindex,follow
      { false, true, false }, // index,nofollow
      { false, false, false }, // index,follow
      { false, false, false }, // missing!
  };

  /** One row per page: { URL the page is processed under, expected base href or null }. */
  private URL[][] currURLsAndAnswers;

  /**
   * Parses every page in {@link #tests}, runs HTMLMetaProcessor over the
   * resulting DOM fragment and checks the noindex / nofollow / nocache flags
   * and the base href against the expected values.
   */
  @Test
  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser = new DOMFragmentParser();

    try {
      currURLsAndAnswers = new URL[][] {
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org/foo/"),
              new URL("http://www.nutch.org/") },
          { new URL("http://www.nutch.org"),
              new URL("http://www.nutch.org/base/") } };
    } catch (Exception e) {
      Assert.fail("couldn't make test URLs!");
    }

    for (int i = 0; i < tests.length; i++) {
      byte[] bytes = tests[i].getBytes(UTF_8);

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        // A page that cannot be parsed used to be logged and then checked
        // against an empty fragment; fail right away instead.
        Assert.fail("caught exception parsing test " + i + ": " + e);
      }

      HTMLMetaTags robotsMeta = new HTMLMetaTags();
      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);

      Assert.assertEquals("got index wrong on test " + i, answers[i][0],
          robotsMeta.getNoIndex());
      Assert.assertEquals("got follow wrong on test " + i, answers[i][1],
          robotsMeta.getNoFollow());
      Assert.assertEquals("got cache wrong on test " + i, answers[i][2],
          robotsMeta.getNoCache());
      // Passes when both are null or both are equal URLs.
      Assert.assertEquals("got base href wrong on test " + i + " (got "
          + robotsMeta.getBaseHref() + ")", currURLsAndAnswers[i][1],
          robotsMeta.getBaseHref());
    }
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java deleted file mode 100644 index 15725ae..0000000 --- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java +++ /dev/null @@ -1,347 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.html; - -import org.apache.nutch.parse.Outlink; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.ByteArrayInputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; - -import org.cyberneko.html.parsers.*; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; - -/** - * Unit tests for DOMContentUtils. - */ -public class TestDOMContentUtils { - - private static final String[] testPages = { - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" + "</body></html>"), - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" - + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" - + "</body></html>"), - new String("<html><head><title> </title>" + "</head><body> " - + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" - + "</a></a>" + "</body></html>"), - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" - + "</body></html>"), - // test frameset link extraction. The invalid frame in the middle will be - // fixed to a third standalone frame. 
- new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" - + "</frame>" + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" - + "</frame>" + "<frame src=\"right.html\">" + "</frame>" - + "</frameset>" + "</frameset>" + "</body></html>"), - // test <area> and <iframe> link extraction + url normalization - new String( - "<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), - // test whitespace processing for plain text extraction - new String( - "<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." 
+ "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\"><!--no anchor--></a>" - + "<a href=\"g1\"> <!--whitespace--> </a>" - + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>" - + "</body></html>"), }; - - private static int SKIP = 9; - - private static String[] testBaseHrefs = { "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", "http://www.nutch.org/", - 
"http://www.nutch.org/", "http://www.nutch.org/", - "http://www.nutch.org/;something", "http://www.nutch.org/" }; - - private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs = new URL[testPages.length]; - - private static final String[] answerText = { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", "ignore ignore", "test1 test2", - "test1 test2", "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" }; - - private static final String[] answerTitle = { "title", "title", "", - "my title", "my title", "my title", "my title", "", "", "", "title", - "title", "title" }; - - // note: should be in page-order - private static Outlink[][] answerOutlinks; - - private static Configuration conf; - private static DOMContentUtils utils = null; - - @Before - public void setup() { - conf = NutchConfiguration.create(); - conf.setBoolean("parser.html.form.use_action", true); - utils = new DOMContentUtils(conf); - DOMFragmentParser parser = new DOMFragmentParser(); - try { - parser - .setFeature( - "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", - true); - } catch (SAXException e) { - } - for (int i = 0; i < testPages.length; i++) { - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - try { - parser.parse( - new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), - node); - testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); - } catch (Exception e) { - Assert.assertTrue("caught 
exception: " + e, false); - } - testDOMs[i] = node; - } - try { - answerOutlinks = new Outlink[][] { - { new Outlink("http://www.nutch.org", "anchor"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, - { new Outlink("http://www.nutch.org/", "separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), }, - { new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), }, - { new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), }, - { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, - {}, - { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, - {}, - { new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", - "anchor5") }, - { new Outlink("http://www.nutch.org/g", ""), - new Outlink("http://www.nutch.org/g1", ""), - new Outlink("http://www.nutch.org/g2", "bla bla"), - new Outlink("http://www.nutch.org/test.gif", "bla bla"), } }; - - } 
catch (MalformedURLException e) { - - } - } - - private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1 = new StringTokenizer(s1); - StringTokenizer st2 = new StringTokenizer(s2); - - while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) - return false; - if (!st1.nextToken().equals(st2.nextToken())) - return false; - } - if (st2.hasMoreTokens()) - return false; - return true; - } - - @Test - public void testGetText() { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerText[i], text)); - } - } - - @Test - public void testGetTitle() { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getTitle(sb, testDOMs[i]); - String text = sb.toString(); - Assert.assertTrue( - "expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerTitle[i], text)); - } - } - - @Test - public void testGetOutlinks() { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - if (i == SKIP) { - conf.setBoolean("parser.html.form.use_action", false); - utils.setConf(conf); - } else { - conf.setBoolean("parser.html.form.use_action", true); - utils.setConf(conf); - } - utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr = new Outlink[outlinks.size()]; - outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr); - compareOutlinks(answerOutlinks[i], outlinkArr); - } - } - - private static final void appendOutlinks(StringBuffer 
sb, Outlink[] o) { - for (int i = 0; i < o.length; i++) { - sb.append(o[i].toString()); - sb.append(System.getProperty("line.separator")); - } - } - - private static final String outlinksString(Outlink[] o) { - StringBuffer sb = new StringBuffer(); - appendOutlinks(sb, o); - return sb.toString(); - } - - private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { - if (o1.length != o2.length) { - Assert.assertTrue( - "got wrong number of outlinks (expecting " + o1.length + ", got " - + o2.length + ")" + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) + System.getProperty("line.separator"), - false); - } - - for (int i = 0; i < o1.length; i++) { - if (!o1[i].equals(o2[i])) { - Assert.assertTrue( - "got wrong outlinks at position " + i - + System.getProperty("line.separator") + "answer: " - + System.getProperty("line.separator") + "'" + o1[i].getToUrl() - + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") + "got: " - + System.getProperty("line.separator") + "'" + o2[i].getToUrl() - + "', anchor: '" + o2[i].getAnchor() + "'", false); - - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java deleted file mode 100644 index 7099f50..0000000 --- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.html; - -import java.nio.charset.Charset; - -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.html.HtmlParser; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class TestHtmlParser { - - public static final Logger LOG = LoggerFactory - .getLogger(TestHtmlParser.class); - - private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά"; - private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; - private static final String encodingTestContent = "<title>" - + encodingTestKeywords + "</title>\n" - + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n" - + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; - - private static String[][] encodingTestPages = { - { - "HTML4, utf-8, meta http-equiv, no quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - +
"\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />" - + encodingTestContent }, - { - "HTML4, utf-8, meta http-equiv, single quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" - + encodingTestContent }, - { - "XHTML, utf-8, meta http-equiv, double quotes", - "utf-8", - "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" - + "<html>\n<head>\n" - + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" - + encodingTestContent }, - { - "HTML5, utf-8, meta charset", - "utf-8", - "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">" - + encodingTestContent }, - { "HTML5, utf-8, BOM", "utf-8", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent }, - { "HTML5, utf-16, BOM", "utf-16", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; - - private Configuration conf; - private Parser parser; - - public TestHtmlParser() { - conf = NutchConfiguration.create(); - parser = new HtmlParser(); - parser.setConf(conf); - } - - protected Parse parse(byte[] contentBytes) { - String dummyUrl = "http://dummy.url/"; - return parser.getParse( - new Content(dummyUrl, dummyUrl, contentBytes, "text/html", - new Metadata(), conf)).get(dummyUrl); - } - - @Test - public void testEncodingDetection() { - for (String[] testPage : encodingTestPages) { - String name = testPage[0]; - Charset charset = Charset.forName(testPage[1]); - byte[] contentBytes = testPage[2].getBytes(charset); - Parse parse = parse(contentBytes); - String text = parse.getText(); - String title = parse.getData().getTitle(); - String keywords = parse.getData().getMeta("keywords"); - LOG.info(name); - LOG.info("title:\t" + title); - LOG.info("keywords:\t" + keywords); 
- LOG.info("text:\t" + text); - Assert.assertEquals("Title not extracted properly (" + name + ")", - encodingTestKeywords, title); - for (String keyword : encodingTestKeywords.split(",\\s*")) { - Assert.assertTrue(keyword + " not found in text (" + name + ")", - text.contains(keyword)); - } - Assert.assertNotNull("No keywords extracted", keywords); - Assert.assertEquals("Keywords not extracted properly (" + name + ")", - encodingTestKeywords, keywords); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java deleted file mode 100644 index 5089a10..0000000 --- a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.html; - -import org.apache.nutch.parse.HTMLMetaTags; - -import java.io.ByteArrayInputStream; -import java.net.URL; - -import org.cyberneko.html.parsers.*; -import org.junit.Assert; -import org.junit.Test; -import org.xml.sax.*; -import org.w3c.dom.*; -import org.apache.html.dom.*; - -/** Unit tests for HTMLMetaProcessor. */ -public class TestRobotsMetaProcessor { - - /* - * - * some sample tags: - * - * <meta name="robots" content="index,follow"> <meta name="robots" - * content="noindex,follow"> <meta name="robots" content="index,nofollow"> - * <meta name="robots" content="noindex,nofollow"> - * - * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - */ - - public static String[] tests = { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" + " some text" + "</body></html>", - - "<html><head><title>test page</title>" - + 
"<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" + "</head><body>" - + " some text" + "</body></html>", - - "<html><head><title>test page</title>" + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" - + " some text" + "</body></html>", - - }; - - public static final boolean[][] answers = { { true, true, true }, // NONE - { false, false, true }, // all - { true, true, true }, // nOnE - { true, true, false }, // none - { true, true, false }, // noindex,nofollow - { true, false, false }, // noindex,follow - { false, true, false }, // index,nofollow - { false, false, false }, // index,follow - { false, false, false }, // missing! - }; - - private URL[][] currURLsAndAnswers; - - @Test - public void testRobotsMetaProcessor() { - DOMFragmentParser parser = new DOMFragmentParser(); - ; - - try { - currURLsAndAnswers = new URL[][] { - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org"), null }, - { new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/") }, - { new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/") } }; - } catch (Exception e) { - Assert.assertTrue("couldn't make test URLs!", false); - } - - for (int i = 0; i < tests.length; i++) { - byte[] bytes = tests[i].getBytes(); - - DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); - - try { - parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node); - } catch (Exception e) { - e.printStackTrace(); - } - - HTMLMetaTags robotsMeta = new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); - - Assert.assertTrue("got index wrong on test " + i, - robotsMeta.getNoIndex() == 
answers[i][0]); - Assert.assertTrue("got follow wrong on test " + i, - robotsMeta.getNoFollow() == answers[i][1]); - Assert.assertTrue("got cache wrong on test " + i, - robotsMeta.getNoCache() == answers[i][2]); - Assert - .assertTrue( - "got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) - || ((robotsMeta.getBaseHref() != null) && robotsMeta - .getBaseHref().equals(currURLsAndAnswers[i][1]))); - - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java new file mode 100644 index 0000000..024aadf --- /dev/null +++ b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.metatags; + +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestMetatagParser { + + private String fileSeparator = System.getProperty("file.separator"); + private String sampleDir = System.getProperty("test.data", "."); + private String sampleFile = "testMetatags.html"; + private String sampleFileMultival = "testMultivalueMetatags.html"; + private String description = "This is a test of description"; + private String keywords = "This is a test of keywords"; + + public Metadata parseMeta(String fileName, Configuration conf) { + Metadata metadata = null; + try { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + metadata = parse.getData().getParseMeta(); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.toString()); + } + return metadata; + } + + @Test + /** test defaults: keywords and description */ + public void testIt() { + Configuration conf = NutchConfiguration.create(); + + // check that we get the same values + Metadata parseMeta = parseMeta(sampleFile, conf); + + Assert.assertEquals(description, parseMeta.get("metatag.description")); + Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); + } + + @Test + /** test multiple metatags 
resulting in metadata with multiple values */ + public void testMultiValueMetatags() { + Configuration conf = NutchConfiguration.create(); + conf.set("metatags.names", "keywords,DC.creator"); + conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator"); + + Metadata parseMeta = parseMeta(sampleFileMultival, conf); + + String failMessage = "One value of metatag with multiple values is missing: "; + + Set<String> valueSet = new TreeSet<String>(); + for (String val : parseMeta.getValues("metatag.dc.creator")) { + valueSet.add(val); + } + String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" }; + for (String val : expectedValues1) { + Assert.assertTrue(failMessage + val, valueSet.contains(val)); + } + + valueSet.clear(); + for (String val : parseMeta.getValues("metatag.keywords")) { + valueSet.add(val); + } + String[] expectedValues2 = { "robot d'indexation", "web crawler", + "Webcrawler" }; + for (String val : expectedValues2) { + Assert.assertTrue(failMessage + val, valueSet.contains(val)); + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java deleted file mode 100644 index 024aadf..0000000 --- a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.metatags; - -import java.util.Set; -import java.util.TreeSet; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestMetatagParser { - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - private String sampleFile = "testMetatags.html"; - private String sampleFileMultival = "testMultivalueMetatags.html"; - private String description = "This is a test of description"; - private String keywords = "This is a test of keywords"; - - public Metadata parseMeta(String fileName, Configuration conf) { - Metadata metadata = null; - try { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new 
ParseUtil(conf).parse(content).get(content.getUrl()); - metadata = parse.getData().getParseMeta(); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return metadata; - } - - @Test - /** test defaults: keywords and description */ - public void testIt() { - Configuration conf = NutchConfiguration.create(); - - // check that we get the same values - Metadata parseMeta = parseMeta(sampleFile, conf); - - Assert.assertEquals(description, parseMeta.get("metatag.description")); - Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); - } - - @Test - /** test multiple metatags resulting in metadata with multiple values */ - public void testMultiValueMetatags() { - Configuration conf = NutchConfiguration.create(); - conf.set("metatags.names", "keywords,DC.creator"); - conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator"); - - Metadata parseMeta = parseMeta(sampleFileMultival, conf); - - String failMessage = "One value of metatag with multiple values is missing: "; - - Set<String> valueSet = new TreeSet<String>(); - for (String val : parseMeta.getValues("metatag.dc.creator")) { - valueSet.add(val); - } - String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" }; - for (String val : expectedValues1) { - Assert.assertTrue(failMessage + val, valueSet.contains(val)); - } - - valueSet.clear(); - for (String val : parseMeta.getValues("metatag.keywords")) { - valueSet.add(val); - } - String[] expectedValues2 = { "robot d'indexation", "web crawler", - "Webcrawler" }; - for (String val : expectedValues2) { - Assert.assertTrue(failMessage + val, valueSet.contains(val)); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java 
b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java new file mode 100644 index 0000000..593d5ed --- /dev/null +++ b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.replace; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +public class TestParseReplace { + + private String fileSeparator = System.getProperty("file.separator"); + private String sampleDir = System.getProperty("test.data", "."); + private String sampleFile = "testParseReplace.html"; + private String description = "This is a test of description"; + private String keywords = "This is a test of keywords"; + + public Metadata parseMeta(String fileName, Configuration conf) { + Metadata metadata = null; + try { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + metadata = parse.getData().getParseMeta(); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.toString()); + } + return metadata; + } + + @Test + /** test defaults: keywords and description */ + public void testIt() { + Configuration conf = NutchConfiguration.create(); + + // check that we get the same values + Metadata parseMeta = parseMeta(sampleFile, conf); + + Assert.assertEquals(description, parseMeta.get("metatag.description")); + Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java 
---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java deleted file mode 100644 index 593d5ed..0000000 --- a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.replace; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestParseReplace { - - private String fileSeparator = System.getProperty("file.separator"); - private String sampleDir = System.getProperty("test.data", "."); - private String sampleFile = "testParseReplace.html"; - private String description = "This is a test of description"; - private String keywords = "This is a test of keywords"; - - public Metadata parseMeta(String fileName, Configuration conf) { - Metadata metadata = null; - try { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - metadata = parse.getData().getParseMeta(); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return metadata; - } - - @Test - /** test defaults: keywords and description */ - public void testIt() { - Configuration conf = NutchConfiguration.create(); - - // check that we get the same values - Metadata parseMeta = parseMeta(sampleFile, conf); - - Assert.assertEquals(description, parseMeta.get("metatag.description")); - Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java 
---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java new file mode 100644 index 0000000..129b85f --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.swf; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for SWFParser. 
+ */ +public class TestSWFParser { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + private String[] sampleFiles = new String[] { "test1.swf", "test2.swf", + "test3.swf" }; + private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", + "test3.txt" }; + + @Test + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + + parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + Assert.assertTrue(sampleTexts[i].equals(text)); + } + } + + public TestSWFParser() { + for (int i = 0; i < sampleFiles.length; i++) { + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleTexts[i]); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + } + isr.close(); + sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim(); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java 
b/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java deleted file mode 100644 index 129b85f..0000000 --- a/nutch-plugins/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.swf; - -import java.io.FileInputStream; -import java.io.InputStreamReader; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.hadoop.io.Text; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for SWFParser. 
- */ -public class TestSWFParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - private String[] sampleFiles = new String[] { "test1.swf", "test2.swf", - "test3.swf" }; - private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", - "test3.txt" }; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - Configuration conf = NutchConfiguration.create(); - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - - parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - - String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); - Assert.assertTrue(sampleTexts[i].equals(text)); - } - } - - public TestSWFParser() { - for (int i = 0; i < sampleFiles.length; i++) { - try { - // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator - + sampleTexts[i]); - StringBuffer sb = new StringBuffer(); - int len = 0; - InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); - char[] buf = new char[1024]; - while ((len = isr.read(buf)) > 0) { - sb.append(buf, 0, len); - } - isr.close(); - sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim(); - } catch (Exception e) { - e.printStackTrace(); - } - } - } - -}
