// http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
// diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
// new file mode 100644, index 0000000..15725ae
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import org.apache.nutch.parse.Outlink;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.cyberneko.html.parsers.*;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/**
 * Unit tests for DOMContentUtils.
 *
 * <p>
 * The fixture arrays below are parallel: entry {@code i} of
 * {@link #testPages}, {@link #testBaseHrefs}, {@link #answerText},
 * {@link #answerTitle} and {@link #answerOutlinks} all describe the same test
 * page.
 */
public class TestDOMContentUtils {

  /** Platform line separator used to lay out multi-line failure messages. */
  private static final String NL = System.lineSeparator();

  /** Configuration key toggled by {@link #testGetOutlinks()}. */
  private static final String USE_ACTION_KEY = "parser.html.form.use_action";

  private static final String[] testPages = {
      "<html><head><title> title </title><script> script </script>"
          + "</head><body> body <a href=\"http://www.nutch.org\">"
          + " anchor </a><!--comment-->" + "</body></html>",
      "<html><head><title> title </title><script> script </script>"
          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
          + "</body></html>",
      "<html><head><title> </title>" + "</head><body> "
          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
          + "</a></a>" + "</body></html>",
      // this one relies on certain neko fixup behavior, possibly
      // distributing the anchors into the LI's-but not the other
      // anchors (outside of them, instead)! So you get a tree that
      // looks like:
      // ... <li> <a href=/> home </a> </li>
      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
      "<html><head><title> my title </title>"
          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
          + "</body></html>",
      // test frameset link extraction. The invalid frame in the middle will be
      // fixed to a third standalone frame.
      "<html><head><title> my title </title>"
          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
          + "</frame>" + "<frameset cols=\"20,*\">"
          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
          + "</frameset>" + "</frameset>" + "</body></html>",
      // test <area> and <iframe> link extraction + url normalization
      "<html><head><title> my title </title>"
          + "</head><body>"
          + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
          + "<map name=\"green\">"
          + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
          + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
          + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
          + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
          + "<iframe src=\"../docs/index.html\"/>" + "</body></html>",
      // test whitespace processing for plain text extraction
      "<html><head>\n <title> my\t\n title\r\n </title>\n"
          + " </head>\n"
          + " <body>\n"
          + " <h1> Whitespace\ttest </h1> \n"
          + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
          + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
          + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
          + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
          + "<table>"
          + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
          + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
          + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
          + "</table>put some text here<Br>and there."
          + "<h2>End\tthis\rmadness\n!</h2>\r\n"
          + " . . . ."
          + "</body> </html>",

      // test that <a rel=nofollow> links are not returned
      "<html><head></head><body>"
          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
          + "</body></html>",
      // test that POST form actions are skipped
      "<html><head></head><body>"
          + "<form method='POST' action='/search.jsp'><input type=text>"
          + "<input type=submit><p>test1</p></form>"
          + "<form method='GET' action='/dummy.jsp'><input type=text>"
          + "<input type=submit><p>test2</p></form></body></html>",
      // test that all form actions are skipped
      "<html><head></head><body>"
          + "<form method='POST' action='/search.jsp'><input type=text>"
          + "<input type=submit><p>test1</p></form>"
          + "<form method='GET' action='/dummy.jsp'><input type=text>"
          + "<input type=submit><p>test2</p></form></body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>",
      "<html><head><title> title </title>" + "</head><body>"
          + "<a href=\"g\"><!--no anchor--></a>"
          + "<a href=\"g1\"> <!--whitespace--> </a>"
          + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
          + "</body></html>", };

  /**
   * Index of the test page for which {@link #testGetOutlinks()} switches
   * {@code parser.html.form.use_action} off (the "all form actions are
   * skipped" page).
   */
  private static final int SKIP = 9;

  private static String[] testBaseHrefs = { "http://www.nutch.org",
      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
      "http://www.nutch.org//", "http://www.nutch.org/",
      "http://www.nutch.org/", "http://www.nutch.org/",
      "http://www.nutch.org/;something", "http://www.nutch.org/" };

  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];

  private static URL[] testBaseHrefURLs = new URL[testPages.length];

  private static final String[] answerText = {
      "title body anchor",
      "title body home bots",
      "separate this from this",
      "my title body home 1 2",
      "my title",
      "my title the bottom",
      "my title Whitespace test whitespace test "
          + "This is a whitespace test . Newlines should appear as space too. "
          + "Tabs are spaces too. This is a break -> and the line after break . "
          + "one two three space here space there no space "
          + "one two two three three four put some text here and there. "
          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
      "test1 test2", "title anchor1 anchor2 anchor3",
      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };

  private static final String[] answerTitle = { "title", "title", "",
      "my title", "my title", "my title", "my title", "", "", "", "title",
      "title", "title" };

  // note: should be in page-order
  private static Outlink[][] answerOutlinks;

  private static Configuration conf;
  private static DOMContentUtils utils = null;

  /**
   * Rebuilds the shared fixtures before every test: a fresh configuration and
   * {@link DOMContentUtils}, one parsed DOM fragment and base URL per test
   * page, and the expected outlinks.
   */
  @Before
  public void setup() {
    conf = NutchConfiguration.create();
    conf.setBoolean(USE_ACTION_KEY, true);
    utils = new DOMContentUtils(conf);
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser
          .setFeature(
              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
              true);
    } catch (SAXException ignored) {
      // Best effort, as in the original test: carry on with the parser's
      // defaults if this feature cannot be set.
    }
    for (int i = 0; i < testPages.length; i++) {
      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
      try {
        // Explicit charset so the bytes do not depend on the platform default.
        parser.parse(
            new InputSource(new ByteArrayInputStream(
                testPages[i].getBytes(StandardCharsets.UTF_8))), node);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        throw new AssertionError("caught exception: " + e, e);
      }
      testDOMs[i] = node;
    }
    try {
      answerOutlinks = new Outlink[][] {
          { new Outlink("http://www.nutch.org", "anchor"), },
          { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
          { new Outlink("http://www.nutch.org/", "separate this"),
              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
          { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/1", "1"),
              new Outlink("http://www.nutch.org/docs/2", "2"), },
          { new Outlink("http://www.nutch.org/frames/top.html", ""),
              new Outlink("http://www.nutch.org/frames/left.html", ""),
              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
              new Outlink("http://www.nutch.org/frames/right.html", ""), },
          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
              new Outlink("http://www.nutch.org/index.html", ""),
              new Outlink("http://www.nutch.org/maps/#bottom", ""),
              new Outlink("http://www.nutch.org/bot.html", ""),
              new Outlink("http://www.nutch.org/docs/index.html", ""), },
          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
          {},
          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
          {},
          { new Outlink("http://www.nutch.org/;x", "anchor1"),
              new Outlink("http://www.nutch.org/g;x", "anchor2"),
              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
          {
              // this is tricky - see RFC3986 section 5.4.1 example 7
              new Outlink("http://www.nutch.org/g", "anchor1"),
              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
                  "anchor5") },
          { new Outlink("http://www.nutch.org/g", ""),
              new Outlink("http://www.nutch.org/g1", ""),
              new Outlink("http://www.nutch.org/g2", "bla bla"),
              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };

    } catch (MalformedURLException e) {
      // Previously swallowed, which left answerOutlinks null and surfaced as
      // an unrelated NullPointerException in testGetOutlinks().
      throw new AssertionError("could not build expected outlinks: " + e, e);
    }
  }

  /**
   * Compares two strings token by token, where tokens are separated by the
   * default {@link StringTokenizer} delimiters (whitespace).
   *
   * @return true if both strings yield the same sequence of tokens
   */
  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
    StringTokenizer st1 = new StringTokenizer(s1);
    StringTokenizer st2 = new StringTokenizer(s2);

    while (st1.hasMoreTokens()) {
      if (!st2.hasMoreTokens()) {
        return false;
      }
      if (!st1.nextToken().equals(st2.nextToken())) {
        return false;
      }
    }
    // Equal only if the second string has no tokens left over.
    return !st2.hasMoreTokens();
  }

  /** Checks the extracted plain text of every page against {@link #answerText}. */
  @Test
  public void testGetText() {
    for (int i = 0; i < testPages.length; i++) {
      StringBuffer sb = new StringBuffer();
      utils.getText(sb, testDOMs[i]);
      String text = sb.toString();
      Assert.assertTrue("expecting text: " + answerText[i] + NL + NL
          + "got text: " + text, equalsIgnoreWhitespace(answerText[i], text));
    }
  }

  /** Checks the extracted title of every page against {@link #answerTitle}. */
  @Test
  public void testGetTitle() {
    for (int i = 0; i < testPages.length; i++) {
      StringBuffer sb = new StringBuffer();
      utils.getTitle(sb, testDOMs[i]);
      String title = sb.toString();
      // Report the expected *title*; the message used to print answerText[i],
      // which made failures misleading.
      Assert.assertTrue("expecting title: " + answerTitle[i] + NL + NL
          + "got title: " + title,
          equalsIgnoreWhitespace(answerTitle[i], title));
    }
  }

  /**
   * Checks the extracted outlinks of every page against
   * {@link #answerOutlinks}. Form actions are enabled for all pages except
   * page {@link #SKIP}.
   */
  @Test
  public void testGetOutlinks() {
    for (int i = 0; i < testPages.length; i++) {
      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
      conf.setBoolean(USE_ACTION_KEY, i != SKIP);
      utils.setConf(conf);
      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
      Outlink[] outlinkArr = outlinks.toArray(new Outlink[outlinks.size()]);
      compareOutlinks(answerOutlinks[i], outlinkArr);
    }
  }

  /** Appends one outlink per line to {@code sb}. */
  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
    for (int i = 0; i < o.length; i++) {
      sb.append(o[i].toString());
      sb.append(NL);
    }
  }

  /** Renders the outlinks one per line, for use in failure messages. */
  private static final String outlinksString(Outlink[] o) {
    StringBuffer sb = new StringBuffer();
    appendOutlinks(sb, o);
    return sb.toString();
  }

  /**
   * Fails the test if the two arrays differ in length or in any position.
   *
   * @param o1
   *          expected outlinks
   * @param o2
   *          outlinks actually extracted
   */
  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
    if (o1.length != o2.length) {
      Assert.fail("got wrong number of outlinks (expecting " + o1.length
          + ", got " + o2.length + ")" + NL + "answer: " + NL
          + outlinksString(o1) + NL + "got: " + NL + outlinksString(o2) + NL);
    }

    for (int i = 0; i < o1.length; i++) {
      if (!o1[i].equals(o2[i])) {
        Assert.fail("got wrong outlinks at position " + i + NL + "answer: "
            + NL + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor()
            + "'" + NL + "got: " + NL + "'" + o2[i].getToUrl()
            + "', anchor: '" + o2[i].getAnchor() + "'");
      }
    }
  }
}
// http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
// diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
// new file mode 100644, index 0000000..7099f50
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.html.HtmlParser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests that {@link HtmlParser} extracts title, keywords and text correctly
 * from pages that are encoded with different charsets and that declare their
 * charset in different ways.
 */
public class TestHtmlParser {

  public static final Logger LOG = LoggerFactory
      .getLogger(TestHtmlParser.class);

  // Language names in French, Spanish, Russian, Czech and Greek. The Russian
  // and Czech entries were mojibake and have been restored.
  // NOTE(review): restored by hand - confirm against the upstream file.
  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
  private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
  private static final String encodingTestContent = "<title>"
      + encodingTestKeywords + "</title>\n"
      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";

  /**
   * One row per test page: { description, charset name used to encode the
   * page, page source }.
   */
  private static final String[][] encodingTestPages = {
      {
          "HTML4, utf-8, meta http-equiv, no quotes",
          "utf-8",
          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
              + "<html>\n<head>\n"
              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
              + encodingTestContent },
      {
          "HTML4, utf-8, meta http-equiv, single quotes",
          "utf-8",
          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
              + "<html>\n<head>\n"
              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
              + encodingTestContent },
      {
          "XHTML, utf-8, meta http-equiv, double quotes",
          "utf-8",
          // NOTE(review): this page opens <html> twice; left as is because it
          // may be deliberate malformed input - confirm.
          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
              + "<html>\n<head>\n"
              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
              + encodingTestContent },
      {
          "HTML5, utf-8, meta charset",
          "utf-8",
          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
              + encodingTestContent },
      { "HTML5, utf-8, BOM", "utf-8",
          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
      { "HTML5, utf-16, BOM", "utf-16",
          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };

  private Configuration conf;
  private Parser parser;

  public TestHtmlParser() {
    conf = NutchConfiguration.create();
    parser = new HtmlParser();
    parser.setConf(conf);
  }

  /**
   * Parses the given bytes as a {@code text/html} document fetched from a
   * dummy URL.
   *
   * @param contentBytes
   *          raw (already encoded) page content
   * @return the parse registered under the dummy URL in the parser's result
   */
  protected Parse parse(byte[] contentBytes) {
    String dummyUrl = "http://dummy.url/";
    return parser.getParse(
        new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
            new Metadata(), conf)).get(dummyUrl);
  }

  /**
   * Encodes every test page with its charset, parses it, and checks that
   * title, keywords meta tag and body text all contain the expected
   * (correctly decoded) strings.
   */
  @Test
  public void testEncodingDetection() {
    for (String[] testPage : encodingTestPages) {
      String name = testPage[0];
      Charset charset = Charset.forName(testPage[1]);
      byte[] contentBytes = testPage[2].getBytes(charset);
      Parse parse = parse(contentBytes);
      // Fail with a named message instead of a NullPointerException below.
      Assert.assertNotNull("No parse result (" + name + ")", parse);
      String text = parse.getText();
      String title = parse.getData().getTitle();
      String keywords = parse.getData().getMeta("keywords");
      LOG.info(name);
      LOG.info("title:\t{}", title);
      LOG.info("keywords:\t{}", keywords);
      LOG.info("text:\t{}", text);
      Assert.assertEquals("Title not extracted properly (" + name + ")",
          encodingTestKeywords, title);
      for (String keyword : encodingTestKeywords.split(",\\s*")) {
        Assert.assertTrue(keyword + " not found in text (" + name + ")",
            text.contains(keyword));
      }
      Assert.assertNotNull("No keywords extracted", keywords);
      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
          encodingTestKeywords, keywords);
    }
  }

}
// http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
// ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
// new file mode 100644, index 0000000..5089a10
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.html;

import org.apache.nutch.parse.HTMLMetaTags;

import java.io.ByteArrayInputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.cyberneko.html.parsers.*;
import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/**
 * Unit tests for HTMLMetaProcessor.
 *
 * <p>
 * {@link #tests}, {@link #answers} and the URL table built inside the test
 * method are parallel arrays: entry {@code i} of each belongs to the same
 * test page.
 */
public class TestRobotsMetaProcessor {

  /*
   *
   * some sample tags:
   *
   * <meta name="robots" content="index,follow"> <meta name="robots"
   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
   * <meta name="robots" content="noindex,nofollow">
   *
   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
   */

  public static String[] tests = {
      "<html><head><title>test page</title>"
          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"all\"> "
          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
          + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"noindex,follow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"index,nofollow\"> "
          + "</head><body>" + " some text" + "</body></html>",

      "<html><head><title>test page</title>"
          + "<meta name=\"robots\" content=\"index,follow\"> "
          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
          + " some text" + "</body></html>",

      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
          + " some text" + "</body></html>",

  };

  /** Expected { noIndex, noFollow, noCache } flags, one row per test page. */
  public static final boolean[][] answers = { { true, true, true }, // NONE
      { false, false, true }, // all
      { true, true, true }, // nOnE
      { true, true, false }, // none
      { true, true, false }, // noindex,nofollow
      { true, false, false }, // noindex,follow
      { false, true, false }, // index,nofollow
      { false, false, false }, // index,follow
      { false, false, false }, // missing!
  };

  /** One row per test page: { URL of the page, expected base href or null }. */
  private URL[][] currURLsAndAnswers;

  /**
   * Parses every test page, runs {@code HTMLMetaProcessor.getMetaTags} on the
   * resulting DOM and checks the robots flags and the base href.
   */
  @Test
  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser = new DOMFragmentParser();

    try {
      currURLsAndAnswers = new URL[][] {
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org"), null },
          { new URL("http://www.nutch.org/foo/"),
              new URL("http://www.nutch.org/") },
          { new URL("http://www.nutch.org"),
              new URL("http://www.nutch.org/base/") } };
    } catch (Exception e) {
      throw new AssertionError("couldn't make test URLs!", e);
    }

    for (int i = 0; i < tests.length; i++) {
      // Explicit charset so the bytes do not depend on the platform default.
      byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        // A parse failure used to be printed and ignored, after which the
        // assertions ran against an empty fragment.
        throw new AssertionError("could not parse test page " + i, e);
      }

      HTMLMetaTags robotsMeta = new HTMLMetaTags();
      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);

      Assert.assertEquals("got index wrong on test " + i, answers[i][0],
          robotsMeta.getNoIndex());
      Assert.assertEquals("got follow wrong on test " + i, answers[i][1],
          robotsMeta.getNoFollow());
      Assert.assertEquals("got cache wrong on test " + i, answers[i][2],
          robotsMeta.getNoCache());
      // assertEquals treats "both null" as equal and otherwise delegates to
      // equals(), which is the same outcome as the previous hand-written
      // null-or-equals check.
      Assert.assertEquals("got base href wrong on test " + i,
          currURLsAndAnswers[i][1], robotsMeta.getBaseHref());
    }
  }

}
// http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/build.xml b/nutch-plugins/parse-js/build.xml new file mode 100644 index 0000000..d9c2146 --- /dev/null +++ b/nutch-plugins/parse-js/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-js" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/ivy.xml b/nutch-plugins/parse-js/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-js/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements.
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/plugin.xml b/nutch-plugins/parse-js/plugin.xml new file mode 100644 index 0000000..9c06c2a --- /dev/null +++ b/nutch-plugins/parse-js/plugin.xml @@ -0,0 +1,53 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-js" + name="JavaScript Parser" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-js.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.parse.js" + name="JS Parser" + point="org.apache.nutch.parse.Parser"> + <implementation id="JSParser" + class="org.apache.nutch.parse.js.JSParseFilter"> + <parameter name="contentType" value="application/x-javascript"/> + <parameter name="pathSuffix" value="js"/> + </implementation> + </extension> + <extension id="org.apache.nutch.parse.js.JSParseFilter" + name="Parse JS Filter" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="JSParseFilter" + class="org.apache.nutch.parse.js.JSParseFilter"> + <parameter name="contentType" value="application/x-javascript"/> + <parameter name="pathSuffix" value=""/> + </implementation> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/pom.xml b/nutch-plugins/parse-js/pom.xml new file mode 100644 index 0000000..68d5770 --- /dev/null +++ b/nutch-plugins/parse-js/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. 
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-js</artifactId> + <packaging>jar</packaging> + + <name>parse-js</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java new file mode 100644 index 0000000..8c95372 --- /dev/null +++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.js; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; +import 
org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * This class is a heuristic link extractor for JavaScript files and code + * snippets. The general idea of a two-pass regex matching comes from Heritrix. + * Parts of the code come from OutlinkExtractor.java + */ +public class JSParseFilter implements HtmlParseFilter, Parser { + public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class); + + private static final int MAX_TITLE_LEN = 80; + + private Configuration conf; + + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + Parse parse = parseResult.get(content.getUrl()); + + String url = content.getBaseUrl(); + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + walk(doc, parse, metaTags, url, outlinks); + if (outlinks.size() > 0) { + Outlink[] old = parse.getData().getOutlinks(); + String title = parse.getData().getTitle(); + List<Outlink> list = Arrays.asList(old); + outlinks.addAll(list); + ParseStatus status = parse.getData().getStatus(); + String text = parse.getText(); + Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks + .size()]); + ParseData parseData = new ParseData(status, title, newlinks, parse + .getData().getContentMeta(), parse.getData().getParseMeta()); + + // replace original parse obj with new one + parseResult.put(content.getUrl(), new ParseText(text), parseData); + } + return parseResult; + } + + private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, + List<Outlink> outlinks) { + if (n instanceof Element) { + String name = n.getNodeName(); + if (name.equalsIgnoreCase("script")) { + /* + * String lang = null; Node lNode = + * n.getAttributes().getNamedItem("language"); if (lNode == null) lang = + * "javascript"; else lang = lNode.getNodeValue(); + */ + StringBuffer script = new StringBuffer(); + NodeList nn = 
n.getChildNodes(); + if (nn.getLength() > 0) { + for (int i = 0; i < nn.getLength(); i++) { + if (i > 0) + script.append('\n'); + script.append(nn.item(i).getNodeValue()); + } + // if (LOG.isInfoEnabled()) { + // LOG.info("script: language=" + lang + ", text: " + + // script.toString()); + // } + Outlink[] links = getJSLinks(script.toString(), "", base); + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); + // no other children of interest here, go one level up. + return; + } + } else { + // process all HTML 4.0 events, if present... + NamedNodeMap attrs = n.getAttributes(); + int len = attrs.getLength(); + for (int i = 0; i < len; i++) { + // Window: onload,onunload + // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus + // Keyboard: onkeydown,onkeypress,onkeyup + // Mouse: + // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup + Node anode = attrs.item(i); + Outlink[] links = null; + if (anode.getNodeName().startsWith("on")) { + links = getJSLinks(anode.getNodeValue(), "", base); + } else if (anode.getNodeName().equalsIgnoreCase("href")) { + String val = anode.getNodeValue(); + if (val != null && val.toLowerCase().indexOf("javascript:") != -1) { + links = getJSLinks(val, "", base); + } + } + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); + } + } + } + NodeList nl = n.getChildNodes(); + for (int i = 0; i < nl.getLength(); i++) { + walk(nl.item(i), parse, metaTags, base, outlinks); + } + } + + public ParseResult getParse(Content c) { + String type = c.getContentType(); + if (type != null && !type.trim().equals("") + && !type.toLowerCase().startsWith("application/x-javascript")) + return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, + "Content not JavaScript: '" + type + "'").getEmptyParseResult( + c.getUrl(), getConf()); + String script = new String(c.getContent()); + Outlink[] outlinks = getJSLinks(script, "", c.getUrl()); + if (outlinks == null) + outlinks = new Outlink[0]; + 
// Title? use the first line of the script... + String title; + int idx = script.indexOf('\n'); + if (idx != -1) { + if (idx > MAX_TITLE_LEN) + idx = MAX_TITLE_LEN; + title = script.substring(0, idx); + } else { + idx = Math.min(MAX_TITLE_LEN, script.length()); + title = script.substring(0, idx); + } + ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, + c.getMetadata()); + return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd)); + } + + private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)"; + // A simple pattern. This allows also invalid URL characters. + private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)"; + + // Alternative pattern, which limits valid url characters. + // private static final String URI_PATTERN = + // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; + + /** + * This method extracts URLs from literals embedded in JavaScript. 
+ */ + private Outlink[] getJSLinks(String plainText, String anchor, String base) { + + final List<Outlink> outlinks = new ArrayList<Outlink>(); + URL baseURL = null; + + try { + baseURL = new URL(base); + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error("getJSLinks", e); + } + } + + try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(STRING_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final Pattern pattern1 = cp.compile(URI_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcher matcher1 = new Perl5Matcher(); + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + // loop the matches + while (matcher.contains(input, pattern)) { + result = matcher.getMatch(); + url = result.group(2); + PatternMatcherInput input1 = new PatternMatcherInput(url); + if (!matcher1.matches(input1, pattern1)) { + // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); + // } + continue; + } + if (url.startsWith("www.")) { + url = "http://" + url; + } else { + // See if candidate URL is parseable. If not, pass and move on to + // the next match. + try { + url = new URL(baseURL, url).toString(); + } catch (MalformedURLException ex) { + if (LOG.isTraceEnabled()) { + LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + + baseURL + "'", ex); + } + continue; + } + } + url = url.replaceAll("&", "&"); + if (LOG.isTraceEnabled()) { + LOG.trace(" - outlink from JS: '" + url + "'"); + } + outlinks.add(new Outlink(url, anchor)); + } + } catch (Exception ex) { + // if it is a malformed URL we just throw it away and continue with + // extraction. 
+ if (LOG.isErrorEnabled()) { + LOG.error("getJSLinks", ex); + } + } + + final Outlink[] retval; + + // create array of the Outlinks + if (outlinks != null && outlinks.size() > 0) { + retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + } else { + retval = new Outlink[0]; + } + + return retval; + } + + public static void main(String[] args) throws Exception { + if (args.length < 2) { + System.err.println(JSParseFilter.class.getName() + " file.js baseURL"); + return; + } + InputStream in = new FileInputStream(args[0]); + BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); + StringBuffer sb = new StringBuffer(); + String line = null; + while ((line = br.readLine()) != null) + sb.append(line + "\n"); + br.close(); + + JSParseFilter parseFilter = new JSParseFilter(); + parseFilter.setConf(NutchConfiguration.create()); + Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]); + System.out.println("Outlinks extracted: " + links.length); + for (int i = 0; i < links.length; i++) + System.out.println(" - " + links[i]); + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java new file mode 100644 index 0000000..36d0d14 --- /dev/null +++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parser and parse filter plugin to extract all (possible) links + * from JavaScript files and embedded JavaScript code snippets. + */ +package org.apache.nutch.parse.js; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/README.txt b/nutch-plugins/parse-metatags/README.txt new file mode 100644 index 0000000..0d5b009 --- /dev/null +++ b/nutch-plugins/parse-metatags/README.txt @@ -0,0 +1,17 @@ +Parse-metatags plugin + +The parse-metatags plugin consists of an HtmlParseFilter which takes as its parameter a list of metatag names, with '*' as the default value. The names are separated by ','. +In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml + +<property> + <name>metatags.names</name> + <value>description,keywords</value> +</property> + +The plugin prefixes the names with 'metatag.' in the parse metadata. For instance, to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. 
+ +This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com + + + + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/build.xml b/nutch-plugins/parse-metatags/build.xml new file mode 100644 index 0000000..e30292d --- /dev/null +++ b/nutch-plugins/parse-metatags/build.xml @@ -0,0 +1,37 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-metatags" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + </target> + + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.html" /> + </fileset> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/ivy.xml b/nutch-plugins/parse-metatags/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-metatags/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/plugin.xml b/nutch-plugins/parse-metatags/plugin.xml new file mode 100644 index 0000000..07933fa --- /dev/null +++ b/nutch-plugins/parse-metatags/plugin.xml @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-metatags" + name="MetaTags" + version="1.0" + provider-name="digitalpebble.com"> + + <runtime> + <library name="parse-metatags.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.parse.metatags.parser" + name="MetaTags Parser" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="MetaTagsParser" + class="org.apache.nutch.parse.metatags.MetaTagsParser"/> + </extension> + +</plugin> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/pom.xml b/nutch-plugins/parse-metatags/pom.xml new file mode 100644 index 0000000..e96d404 --- /dev/null +++ b/nutch-plugins/parse-metatags/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. 
See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-metatags</artifactId> + <packaging>jar</packaging> + + <name>parse-metatags</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/sample/testMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/sample/testMetatags.html b/nutch-plugins/parse-metatags/sample/testMetatags.html new file mode 100644 index 0000000..e9e8e6b --- /dev/null +++ b/nutch-plugins/parse-metatags/sample/testMetatags.html @@ -0,0 +1,9 @@ +<html> +<head> +<meta name="Keywords" content="This is a test of keywords" /> +<meta name="Description" content="This is a test of description" /> +</head> +<body> +text 
of the document +</body> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html b/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html new file mode 100644 index 0000000..ca8b737 --- /dev/null +++ b/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html @@ -0,0 +1,12 @@ +<html> +<head> +<meta name="DC.creator" content="Doug Cutting"> +<meta name="DC.creator" content="Michael Cafarella"> +<!-- meta keywords in different casing --> +<meta name="keywords" lang="en" content="web crawler" /> +<meta name="Keywords" lang="fr" content="robot d'indexation" /> +<meta name="KEYWORDS" lang="de" content="Webcrawler" /> +</head> +<body> +A test for multi-valued metatags. +</body> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java new file mode 100644 index 0000000..f9b9722 --- /dev/null +++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.metatags; + +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Locale; +import java.util.Properties; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.w3c.dom.DocumentFragment; + +/** + * Parse HTML meta tags (keywords, description) and store them in the parse + * metadata so that they can be indexed with the index-metadata plugin with the + * prefix 'metatag.'. Metatags are matched ignoring case. + */ +public class MetaTagsParser implements HtmlParseFilter { + + private static final Log LOG = LogFactory.getLog(MetaTagsParser.class + .getName()); + + private Configuration conf; + + private Set<String> metatagset = new HashSet<String>(); + + public void setConf(Configuration conf) { + this.conf = conf; + // specify whether we want a specific subset of metadata + // by default take everything we can find + String[] values = conf.getStrings("metatags.names", "*"); + for (String val : values) { + metatagset.add(val.toLowerCase(Locale.ROOT)); + } + } + + public Configuration getConf() { + return this.conf; + } + + /** + * Check whether the metatag is in the list of metatags to be indexed (or if + * '*' is specified). 
If yes, add it to parse metadata. + */ + private void addIndexedMetatags(Metadata metadata, String metatag, + String value) { + String lcMetatag = metatag.toLowerCase(Locale.ROOT); + if (metatagset.contains("*") || metatagset.contains(lcMetatag)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Found meta tag: " + lcMetatag + "\t" + value); + } + metadata.add("metatag." + lcMetatag, value); + } + } + + /** + * Check whether the metatag is in the list of metatags to be indexed (or if + * '*' is specified). If yes, add it with all values to parse metadata. + */ + private void addIndexedMetatags(Metadata metadata, String metatag, + String[] values) { + String lcMetatag = metatag.toLowerCase(Locale.ROOT); + if (metatagset.contains("*") || metatagset.contains(lcMetatag)) { + for (String value : values) { + if (LOG.isDebugEnabled()) { + LOG.debug("Found meta tag: " + lcMetatag + "\t" + value); + } + metadata.add("metatag." + lcMetatag, value); + } + } + } + + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + Parse parse = parseResult.get(content.getUrl()); + Metadata metadata = parse.getData().getParseMeta(); + + // check in the metadata first : the tika-parser + // might have stored the values there already + for (String mdName : metadata.names()) { + addIndexedMetatags(metadata, mdName, metadata.getValues(mdName)); + } + + Metadata generalMetaTags = metaTags.getGeneralTags(); + for (String tagName : generalMetaTags.names()) { + addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName)); + } + + Properties httpequiv = metaTags.getHttpEquivTags(); + for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames + .hasMoreElements();) { + String name = (String) tagNames.nextElement(); + String value = httpequiv.getProperty(name); + addIndexedMetatags(metadata, name, value); + } + + return parseResult; + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java new file mode 100644 index 0000000..a55cf5c --- /dev/null +++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse filter to extract meta tags: keywords, description, etc. + * Used in combination with index-metadata plugin + * (see {@link org.apache.nutch.indexer.metadata}). 
+ */ +package org.apache.nutch.parse.metatags; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java new file mode 100644 index 0000000..024aadf --- /dev/null +++ b/nutch-plugins/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.parse.metatags; + +import java.util.Set; +import java.util.TreeSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; +
+/**
+ * Tests that the metatags of the sample HTML files end up in the parse
+ * metadata under keys prefixed with 'metatag.'.
+ */
+public class TestMetatagParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String sampleFileMultival = "testMultivalueMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  /**
+   * Fetches the named sample file through its protocol plugin, parses it and
+   * returns the resulting parse metadata. Any exception fails the test.
+   */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    try {
+      String sampleUrl = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
+      Content fetched = protocol.getProtocolOutput(new Text(sampleUrl),
+          new CrawlDatum()).getContent();
+      Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());
+      return parsed.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    // only reached formally: Assert.fail() always throws
+    return null;
+  }
+
+  /** test defaults: keywords and description */
+  @Test
+  public void testIt() {
+    Metadata meta = parseMeta(sampleFile, NutchConfiguration.create());
+
+    // check that we get the same values
+    Assert.assertEquals(description, meta.get("metatag.description"));
+    Assert.assertEquals(keywords, meta.get("metatag.keywords"));
+  }
+
+  /** test multiple metatags resulting in metadata with multiple values */
+  @Test
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("metatags.names", "keywords,DC.creator");
+    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+    Metadata meta = parseMeta(sampleFileMultival, conf);
+
+    assertContainsAll(meta, "metatag.dc.creator", "Doug Cutting",
+        "Michael Cafarella");
+    assertContainsAll(meta, "metatag.keywords", "robot d'indexation",
+        "web crawler", "Webcrawler");
+  }
+
+  /**
+   * Asserts that every expected value is among the values stored under the
+   * given key.
+   */
+  private static void assertContainsAll(Metadata meta, String key,
+      String... expected) {
+    Set<String> actual = new TreeSet<String>();
+    for (String value : meta.getValues(key)) {
+      actual.add(value);
+    }
+    for (String value : expected) {
+      Assert.assertTrue(
+          "One value of metatag with multiple values is missing: " + value,
+          actual.contains(value));
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/README.txt b/nutch-plugins/parse-replace/README.txt new file mode 100644 index 0000000..a18bd9c --- /dev/null +++ b/nutch-plugins/parse-replace/README.txt @@ -0,0 +1,91 @@ +ParseReplace plugin + +Allows post-parsing regexp replace manipulation of metadata fields. + +Configuration Example + <property> + <name>parse.replace.regexp</name> + <value> + id=/file:/http:/ + url=/file:/http:/128 + </value> + </property + +Property format: parse.replace.regexp + The format of the property is a list of regexp replacements, one line per field being + modified. Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure. + + The fieldname preceeds the equal sign.
The first character after the equal sign signifies + the delimiter for the regexp, the replacement value and the flags. + +Replacement Sequence + The replacements will happen in the order listed. If a field needs multiple replacement operations + they may be listed more than once. + +RegExp Format + The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined + here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29 + Patterns are compiled when the plugin is initialized for efficiency. + +Replacement Format + The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement): + http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29 + +Flags + The flags value is an integer sum of the flag values defined in + http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern) + +Escaping + Since the regexp is being read from a config file, any escaped values must be double + escaped. E.g.: id=/\\s+// will cause the escaped \s+ match pattern to be used. + +Multi-valued Fields + If a field has multiple values, the replacement will be applied to each value in turn. + +Non-string Datatypes + Replacement is possible only on String field datatypes. If the field you name in the property is + not a String datatype, it will be silently ignored. + +Host and URL specific replacements. + If the replacements should apply only to specific pages, then add a sequence like + + hostmatch=/host match pattern/ + fld1=/regexp/replace/flags + fld2=/regexp/replace/flags + + or + urlmatch=/url match pattern/ + fld1=/regexp/replace/flags + fld2=/regexp/replace/flags + +When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch +will apply to all parsed pages. 
Replacements following a hostmatch or urlmatch will be applied +to pages which match the host or url field (up to the next hostmatch or urlmatch line). hostmatch +and urlmatch patterns must be unique in this property. + +Plugin order + TBD... But in most cases you will want this plugin to run last. + +Testing your match patterns + Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html + can help get the basics of your pattern working. + To test in nutch: + Prepare a test HTML file with the field contents you want to test. + Place this in a directory accessible to nutch. + Use the file:/// syntax to list the test file(s) in a test/urls seed list. + See the nutch faq "index my local file system" for conf settings you will need. + (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This + test approach confirms only how your global matches behave, unless your urlmatch and hostmatch + patterns also match the file: URL pattern) + + Run.. + bin/nutch inject crawl/crawldb test + bin/nutch generate crawl/crawldb crawl/segments + bin/nutch fetch crawl/segments/[segment] + bin/nutch parse crawl/segments/[segment] + + To inspect the returned fields... + bin/nutch readseg -dump crawl/segments/[segment] testout + less testout/dump + + To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/build.xml b/nutch-plugins/parse-replace/build.xml new file mode 100644 index 0000000..ca5ccf7 --- /dev/null +++ b/nutch-plugins/parse-replace/build.xml @@ -0,0 +1,37 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-replace" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + </target> + + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.html" /> + </fileset> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/ivy.xml b/nutch-plugins/parse-replace/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-replace/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/plugin.xml b/nutch-plugins/parse-replace/plugin.xml new file mode 100644 index 0000000..6368210 --- /dev/null +++ b/nutch-plugins/parse-replace/plugin.xml @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-replace" + name="ReplaceParser" + version="1.0" + provider-name="PeterCiuffetti"> + + <runtime> + <library name="parse-replace.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.parse.replace.parser" + name="Replace Parser" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="ReplaceParser" + class="org.apache.nutch.parse.replace.ReplaceParser"/> + </extension> + +</plugin> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/pom.xml ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/parse-replace/pom.xml b/nutch-plugins/parse-replace/pom.xml new file mode 100644 index 0000000..073f895 --- /dev/null +++ b/nutch-plugins/parse-replace/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-replace</artifactId> + <packaging>jar</packaging> + + <name>parse-replace</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/sample/testParseReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/sample/testParseReplace.html b/nutch-plugins/parse-replace/sample/testParseReplace.html new file mode 100644 index 0000000..825dcb9 --- /dev/null +++ b/nutch-plugins/parse-replace/sample/testParseReplace.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>Testing the power of parser-replace plugin</title> + <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!"> + <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!"> + <meta name="author" content="Peter Ciuffetti"> + </head> + <body> + <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. 
A decidely boring thing to do.</p> + </body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java new file mode 100644 index 0000000..9773c4a --- /dev/null +++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.replace; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.w3c.dom.DocumentFragment; + +/** + * Do pattern replacements on selected field contents + * prior to indexing. + * <p> + * Skeleton only: the configured rules are not parsed yet and + * <code>filter</code> returns the parse result unchanged. + */ +public class ReplaceParser implements HtmlParseFilter { + + private static final Log LOG = LogFactory.getLog(ReplaceParser.class + .getName()); + + // Rule tables; nothing in this class fills or reads them yet. + private static final Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>(); + private static final Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>(); + + private Configuration conf; + + private Set<String> metatagset = new HashSet<String>(); + + /** + * Stores the configuration and, when <code>parse.replace.regexp</code> is + * set, hands its values to <code>parseConf</code>. + * + * @param conf + * configuration to read the replacement rules from + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + // NOTE(review): getStrings() tokenizes the value on commas, while the + // README describes one rule per line - confirm the intended tokenization. + String[] values = conf.getStrings("parse.replace.regexp", (String[]) null); + if (values != null) { + this.parseConf(values); + } + } + + /** + * @return the configuration passed to the last <code>setConf</code> call + */ + @Override + public Configuration getConf() { + return this.conf; + } + + /** + * Placeholder for turning the configured rule strings into replacement + * patterns; currently does nothing. + * + * @param values + * raw values of <code>parse.replace.regexp</code> + */ + private void parseConf(String[] values) { + + } + + /** + * Currently a pass-through: looks up the parse of the document and returns + * the parse result without modifying it. + * + * @return the <code>parseResult</code> argument + */ + @Override + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + // looked up for the upcoming replacement logic; not used yet + Parse parse = parseResult.get(content.getUrl()); + + return parseResult; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java new file mode 
100644 index 0000000..b678f00 --- /dev/null +++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse filter to allow pattern replacements on parsed metadata. + */ +package org.apache.nutch.parse.replace; + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java new file mode 100644 index 0000000..593d5ed --- /dev/null +++ b/nutch-plugins/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.replace; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Test; + +/** + * Parses the parse-replace sample page and checks the <code>metatag.*</code> + * entries of the resulting parse metadata. + */ +public class TestParseReplace { + + private final String fileSeparator = System.getProperty("file.separator"); + private final String sampleDir = System.getProperty("test.data", "."); + private final String sampleFile = "testParseReplace.html"; + // Expected values mirror the <meta> tags of sample/testParseReplace.html. + // NOTE(review): assumes the metatags parse filter is active in the test + // configuration and that no replacement rule is configured - confirm. + private final String description = "With this plugin, nutch is my bitch! Bwuhuhuhaha!"; + private final String keywords = "Awesome, Riveting, Two Thumbs Up!"; + + /** + * Fetches and parses a sample file and returns its parse metadata. Any + * exception is printed and turned into a JUnit failure. + * + * @param fileName + * sample file name, looked up in the directory given by the + * <code>test.data</code> system property (default ".") + * @param conf + * configuration handed to the protocol factory and the parser + * @return parse metadata of the parsed document + */ + public Metadata parseMeta(String fileName, Configuration conf) { + Metadata metadata = null; + try { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + metadata = parse.getData().getParseMeta(); + } catch (Exception e) { + e.printStackTrace(); + 
// Assert.fail always throws, so null is never returned to a caller + Assert.fail(e.toString()); + } + return metadata; + } + + /** test defaults: keywords and description */ + @Test + public void testIt() { + Configuration conf = NutchConfiguration.create(); + + // check that we get the same values + Metadata parseMeta = parseMeta(sampleFile, conf); + + Assert.assertEquals(description, parseMeta.get("metatag.description")); + Assert.assertEquals(keywords, parseMeta.get("metatag.keywords")); + } +}
