Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Thu Jan 29 05:38:59 2015 @@ -35,189 +35,133 @@ import org.xml.sax.*; import org.w3c.dom.*; import org.apache.html.dom.*; -/** +/** * Unit tests for DOMContentUtils. */ public class TestDOMContentUtils { - private static final String[] testPages= { - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" - + "</body></html>"), - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" - + " home </a><!--comment-->" - + "<style> style </style>" - + " <a href=\"bot.html\">" - + " bots </a>" - + "</body></html>"), - new String("<html><head><title> </title>" - + "</head><body> " - + "<a href=\"/\"> separate this " - + "<a href=\"ok\"> from this" - + "</a></a>" - + "</body></html>"), - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... 
<li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " - + "<ul>" - + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" - + "<li> <a href=\"2\"> 2" - + "</ul>" - + "</body></html>"), - // test frameset link extraction. The invalid frame in the middle will be - // fixed to a third standalone frame. - new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " - + "<frame src=\"top.html\">" - + "</frame>" - + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" - + "<frame src=\"invalid.html\"/>" - + "</frame>" - + "<frame src=\"right.html\">" - + "</frame>" - + "</frameset>" - + "</frameset>" - + "</body></html>"), - // test <area> and <iframe> link extraction + url normalization - new String("<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" - + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" - + "</body></html>"), - // test whitespace processing for plain text extraction - new String("<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." - + "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" - + "</head><body>" - + "<a href=\";x\">anchor1</a>" - + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" - + "</body></html>"), - new String("<html><head><title> title </title>" - + "</head><body>" - + "<a href=\"g\">anchor1</a>" - + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" - + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" - + "</body></html>"), - new String("<html><head><title> title </title>" - + "</head><body>" - + "<a href=\"g\"><!--no anchor--></a>" - 
+ "<a href=\"g1\"> <!--whitespace--> </a>" - + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>" - + "</body></html>"), - }; + private static final String[] testPages = { + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + // test frameset link extraction. The invalid frame in the middle will be + // fixed to a third standalone frame. 
+ new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." 
+ "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\"><!--no anchor--></a>" + + "<a href=\"g1\"> <!--whitespace--> </a>" + + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>" + + "</body></html>"), }; private static int SKIP = 9; - private static String[] testBaseHrefs= { - "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", - "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", - "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", - "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", - "http://www.nutch.org/", - 
"http://www.nutch.org/", - "http://www.nutch.org/", - "http://www.nutch.org/;something", - "http://www.nutch.org/" - }; - - private static final DocumentFragment testDOMs[]= - new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs= - new URL[testPages.length]; - - - private static final String[] answerText= { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", - "ignore ignore", - "test1 test2", - "test1 test2", - "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5", - "title" - }; - - private static final String[] answerTitle= { - "title", - "title", - "", - "my title", - "my title", - "my title", - "my title", - "", - "", - "", - "title", - "title", - "title" - }; + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something", "http://www.nutch.org/" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new URL[testPages.length]; + + private static final String[] answerText = { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test 
whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title", "title" }; // note: should be in page-order private static Outlink[][] answerOutlinks; @@ -230,87 +174,64 @@ public class TestDOMContentUtils { conf = NutchConfiguration.create(); conf.setBoolean("parser.html.form.use_action", true); utils = new DOMContentUtils(conf); - DOMFragmentParser parser= new DOMFragmentParser(); + DOMFragmentParser parser = new DOMFragmentParser(); try { - parser.setFeature( - "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", - true); - } catch (SAXException e) {} - for (int i= 0; i < testPages.length; i++) { - DocumentFragment node= - new HTMLDocumentImpl().createDocumentFragment(); + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + } catch (SAXException e) { + } + for (int i = 0; i < testPages.length; i++) { + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); try { parser.parse( - new InputSource( - new ByteArrayInputStream(testPages[i].getBytes()) ), - node); - testBaseHrefURLs[i]= new URL(testBaseHrefs[i]); + new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), + node); + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); } catch (Exception e) { Assert.assertTrue("caught exception: " + e, false); - } - testDOMs[i]= node; + } + testDOMs[i] = node; } try { - answerOutlinks = new Outlink[][]{ - { - new 
Outlink("http://www.nutch.org", "anchor"), - }, - { - new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), - }, - { - new Outlink("http://www.nutch.org/", "separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), - }, - { - new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), - }, - { - new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), - }, - { - new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), - }, - { - new Outlink("http://www.nutch.org/index.html", "whitespace test"), - }, - { - }, - { - new Outlink("http://www.nutch.org/dummy.jsp", "test2"), - }, - { - }, - { - new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") - }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") - }, - { - new Outlink("http://www.nutch.org/g", ""), - new Outlink("http://www.nutch.org/g1", ""), - new Outlink("http://www.nutch.org/g2", "bla bla"), - new Outlink("http://www.nutch.org/test.gif", "bla bla"), - } - }; + answerOutlinks = new Outlink[][] { + { new 
Outlink("http://www.nutch.org", "anchor"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", "separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") }, + { new Outlink("http://www.nutch.org/g", ""), + new Outlink("http://www.nutch.org/g1", ""), + new Outlink("http://www.nutch.org/g2", "bla bla"), + new Outlink("http://www.nutch.org/test.gif", "bla bla"), } }; } catch (MalformedURLException e) { @@ -318,58 +239,58 @@ public class TestDOMContentUtils { } private static boolean 
equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1= new StringTokenizer(s1); - StringTokenizer st2= new StringTokenizer(s2); + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) + if (!st2.hasMoreTokens()) return false; - if ( ! st1.nextToken().equals(st2.nextToken()) ) + if (!st1.nextToken().equals(st2.nextToken())) return false; } - if (st2.hasMoreTokens()) + if (st2.hasMoreTokens()) return false; return true; } @Test public void testGetText() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuffer sb= new StringBuffer(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); utils.getText(sb, testDOMs[i]); - String text= sb.toString(); - Assert.assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, equalsIgnoreWhitespace(answerText[i], text)); } } @Test public void testGetTitle() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuffer sb= new StringBuffer(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); utils.getTitle(sb, testDOMs[i]); - String text= sb.toString(); - Assert.assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, + String text = sb.toString(); + Assert.assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, 
equalsIgnoreWhitespace(answerTitle[i], text)); } } @Test public void testGetOutlinks() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks= new ArrayList<Outlink>(); + for (int i = 0; i < testPages.length; i++) { + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); if (i == SKIP) { conf.setBoolean("parser.html.form.use_action", false); utils.setConf(conf); @@ -378,51 +299,47 @@ public class TestDOMContentUtils { utils.setConf(conf); } utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr= new Outlink[outlinks.size()]; - outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr); compareOutlinks(answerOutlinks[i], outlinkArr); } } private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i= 0; i < o.length; i++) { + for (int i = 0; i < o.length; i++) { sb.append(o[i].toString()); sb.append(System.getProperty("line.separator")); } } private static final String outlinksString(Outlink[] o) { - StringBuffer sb= new StringBuffer(); + StringBuffer sb = new StringBuffer(); appendOutlinks(sb, o); return sb.toString(); } private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { if (o1.length != o2.length) { - Assert.assertTrue("got wrong number of outlinks (expecting " + o1.length - + ", got " + o2.length + ")" - + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) - + System.getProperty("line.separator"), - false - ); + Assert.assertTrue( + "got wrong number of outlinks (expecting " + o1.length + ", got " + + o2.length + ")" + System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + 
outlinksString(o1) + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + System.getProperty("line.separator"), + false); } - for (int i= 0; i < o1.length; i++) { + for (int i = 0; i < o1.length; i++) { if (!o1[i].equals(o2[i])) { - Assert.assertTrue("got wrong outlinks at position " + i - + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'", - false - ); + Assert.assertTrue( + "got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); } }
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original) +++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Thu Jan 29 05:38:59 2015 @@ -33,69 +33,54 @@ import org.slf4j.LoggerFactory; public class TestHtmlParser { - public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(TestHtmlParser.class); - private static final String encodingTestKeywords = - "français, español, русский язык, čeština, ελληνικά"; - private static final String encodingTestBody = - "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; - private static final String encodingTestContent = - "<title>" + encodingTestKeywords + "</title>\n" - + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n" - + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; - - private static String[][] encodingTestPages= { - { - "HTML4, utf-8, meta http-equiv, no quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />" - + encodingTestContent - }, - { - "HTML4, utf-8, meta http-equiv, single quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" - + encodingTestContent - }, - { - 
"XHTML, utf-8, meta http-equiv, double quotes", - "utf-8", - "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" - + "<html>\n<head>\n" - + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" - + encodingTestContent - }, - { - "HTML5, utf-8, meta charset", - "utf-8", - "<!DOCTYPE html>\n<html>\n<head>\n" - + "<meta charset=\"utf-8\">" - + encodingTestContent - }, - { - "HTML5, utf-8, BOM", - "utf-8", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" - + encodingTestContent - }, - { - "HTML5, utf-16, BOM", - "utf-16", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" - + encodingTestContent - } - }; + private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά"; + private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; + private static final String encodingTestContent = "<title>" + encodingTestKeywords + "</title>\n" + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; + + private static String[][] encodingTestPages = { + { + "HTML4, utf-8, meta http-equiv, no quotes", + "utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />" + + encodingTestContent }, + { + "HTML4, utf-8, meta http-equiv, single quotes", + "utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" + + encodingTestContent }, + { + "XHTML, utf-8, meta http-equiv, double quotes", + "utf-8", + "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" + + "<html>\n<head>\n" + + "<meta 
http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" + + encodingTestContent }, + { + "HTML5, utf-8, meta charset", + "utf-8", + "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">" + + encodingTestContent }, + { "HTML5, utf-8, BOM", "utf-8", + "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent }, + { "HTML5, utf-16, BOM", "utf-16", + "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; private Configuration conf; private Parser parser; - public TestHtmlParser() { + public TestHtmlParser() { conf = NutchConfiguration.create(); parser = new HtmlParser(); parser.setConf(conf); @@ -104,8 +89,8 @@ public class TestHtmlParser { protected Parse parse(byte[] contentBytes) { String dummyUrl = "http://dummy.url/"; return parser.getParse( - new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(), - conf)).get(dummyUrl); + new Content(dummyUrl, dummyUrl, contentBytes, "text/html", + new Metadata(), conf)).get(dummyUrl); } @Test Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original) +++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Thu Jan 29 05:38:59 2015 @@ -33,120 +33,96 @@ import org.apache.html.dom.*; public class TestRobotsMetaProcessor { /* - - some sample tags: - - <meta name="robots" content="index,follow"> - <meta name="robots" content="noindex,follow"> - <meta name="robots" content="index,nofollow"> - <meta name="robots" content="noindex,nofollow"> - - <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - + * + * some sample 
tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> */ + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", + + }; - public static String[] tests= - { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" 
CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" - + "</head><body>" - + " some text" - + "</body></html>", - - }; - - public static final boolean[][] answers= { - {true, true, true}, // NONE - {false, false, true}, // all - {true, true, true}, // nOnE - {true, true, false}, // none - {true, true, false}, // noindex,nofollow - {true, false, false}, // noindex,follow - {false, true, false}, // index,nofollow - {false, false, false}, // index,follow - {false, false, false}, // missing! 
+ public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! }; private URL[][] currURLsAndAnswers; @Test public void testRobotsMetaProcessor() { - DOMFragmentParser parser= new DOMFragmentParser();; + DOMFragmentParser parser = new DOMFragmentParser(); + ; - try { - currURLsAndAnswers= new URL[][] { - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/")}, - {new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/")} - }; + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; } catch (Exception e) { Assert.assertTrue("couldn't make test URLs!", false); } - for (int i= 0; i < tests.length; i++) { - byte[] bytes= tests[i].getBytes(); + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); @@ -156,9 +132,8 @@ public class TestRobotsMetaProcessor { 
e.printStackTrace(); } - HTMLMetaTags robotsMeta= new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, - currURLsAndAnswers[i][0]); + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); Assert.assertTrue("got index wrong on test " + i, robotsMeta.getNoIndex() == answers[i][0]); @@ -166,13 +141,13 @@ public class TestRobotsMetaProcessor { robotsMeta.getNoFollow() == answers[i][1]); Assert.assertTrue("got cache wrong on test " + i, robotsMeta.getNoCache() == answers[i][2]); - Assert.assertTrue("got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ( (robotsMeta.getBaseHref() == null) - && (currURLsAndAnswers[i][1] == null) ) - || ( (robotsMeta.getBaseHref() != null) - && robotsMeta.getBaseHref().equals( - currURLsAndAnswers[i][1]) ) ); + Assert + .assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); } } Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Jan 29 05:38:59 2015 @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. 
-* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.parse.js; import java.io.BufferedReader; @@ -56,9 +56,9 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** - * This class is a heuristic link extractor for JavaScript files and - * code snippets. The general idea of a two-pass regex matching comes from - * Heritrix. Parts of the code come from OutlinkExtractor.java + * This class is a heuristic link extractor for JavaScript files and code + * snippets. The general idea of a two-pass regex matching comes from Heritrix. 
+ * Parts of the code come from OutlinkExtractor.java */ public class JSParseFilter implements HtmlParseFilter, Parser { public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class); @@ -66,9 +66,9 @@ public class JSParseFilter implements Ht private static final int MAX_TITLE_LEN = 80; private Configuration conf; - + public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { + HTMLMetaTags metaTags, DocumentFragment doc) { Parse parse = parseResult.get(content.getUrl()); @@ -82,37 +82,42 @@ public class JSParseFilter implements Ht outlinks.addAll(list); ParseStatus status = parse.getData().getStatus(); String text = parse.getText(); - Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]); - ParseData parseData = new ParseData(status, title, newlinks, - parse.getData().getContentMeta(), - parse.getData().getParseMeta()); + Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks + .size()]); + ParseData parseData = new ParseData(status, title, newlinks, parse + .getData().getContentMeta(), parse.getData().getParseMeta()); // replace original parse obj with new one parseResult.put(content.getUrl(), new ParseText(text), parseData); } return parseResult; } - - private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) { + + private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, + List<Outlink> outlinks) { if (n instanceof Element) { String name = n.getNodeName(); if (name.equalsIgnoreCase("script")) { - /* String lang = null; - Node lNode = n.getAttributes().getNamedItem("language"); - if (lNode == null) lang = "javascript"; - else lang = lNode.getNodeValue(); */ + /* + * String lang = null; Node lNode = + * n.getAttributes().getNamedItem("language"); if (lNode == null) lang = + * "javascript"; else lang = lNode.getNodeValue(); + */ StringBuffer script = new StringBuffer(); NodeList nn = 
n.getChildNodes(); if (nn.getLength() > 0) { for (int i = 0; i < nn.getLength(); i++) { - if (i > 0) script.append('\n'); + if (i > 0) + script.append('\n'); script.append(nn.item(i).getNodeValue()); } // if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + script.toString()); + // LOG.info("script: language=" + lang + ", text: " + + // script.toString()); // } Outlink[] links = getJSLinks(script.toString(), "", base); - if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. return; } @@ -124,7 +129,8 @@ public class JSParseFilter implements Ht // Window: onload,onunload // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus // Keyboard: onkeydown,onkeypress,onkeyup - // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup + // Mouse: + // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup Node anode = attrs.item(i); Outlink[] links = null; if (anode.getNodeName().startsWith("on")) { @@ -135,7 +141,8 @@ public class JSParseFilter implements Ht links = getJSLinks(val, "", base); } } - if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); } } } @@ -144,48 +151,56 @@ public class JSParseFilter implements Ht walk(nl.item(i), parse, metaTags, base, outlinks); } } - + public ParseResult getParse(Content c) { String type = c.getContentType(); - if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript")) + if (type != null && !type.trim().equals("") + && !type.toLowerCase().startsWith("application/x-javascript")) return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, - "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf()); + "Content not JavaScript: '" + type + 
"'").getEmptyParseResult( + c.getUrl(), getConf()); String script = new String(c.getContent()); Outlink[] outlinks = getJSLinks(script, "", c.getUrl()); - if (outlinks == null) outlinks = new Outlink[0]; + if (outlinks == null) + outlinks = new Outlink[0]; // Title? use the first line of the script... String title; int idx = script.indexOf('\n'); if (idx != -1) { - if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN; + if (idx > MAX_TITLE_LEN) + idx = MAX_TITLE_LEN; title = script.substring(0, idx); } else { idx = Math.min(MAX_TITLE_LEN, script.length()); title = script.substring(0, idx); } ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, - c.getMetadata()); + c.getMetadata()); return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd)); } - + private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)"; // A simple pattern. This allows also invalid URL characters. private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)"; + // Alternative pattern, which limits valid url characters. - //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; - + // private static final String URI_PATTERN = + // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; + /** - * This method extracts URLs from literals embedded in JavaScript. + * This method extracts URLs from literals embedded in JavaScript. 
*/ private Outlink[] getJSLinks(String plainText, String anchor, String base) { final List<Outlink> outlinks = new ArrayList<Outlink>(); URL baseURL = null; - + try { baseURL = new URL(base); } catch (Exception e) { - if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); } + if (LOG.isErrorEnabled()) { + LOG.error("getJSLinks", e); + } } try { @@ -194,8 +209,8 @@ public class JSParseFilter implements Ht Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK); final Pattern pattern1 = cp.compile(URI_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); final PatternMatcher matcher = new Perl5Matcher(); final PatternMatcher matcher1 = new Perl5Matcher(); @@ -204,26 +219,27 @@ public class JSParseFilter implements Ht MatchResult result; String url; - //loop the matches + // loop the matches while (matcher.contains(input, pattern)) { result = matcher.getMatch(); url = result.group(2); PatternMatcherInput input1 = new PatternMatcherInput(url); if (!matcher1.matches(input1, pattern1)) { - //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); } + // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); + // } continue; } if (url.startsWith("www.")) { - url = "http://" + url; + url = "http://" + url; } else { - // See if candidate URL is parseable. If not, pass and move on to + // See if candidate URL is parseable. If not, pass and move on to // the next match. 
try { url = new URL(baseURL, url).toString(); } catch (MalformedURLException ex) { if (LOG.isTraceEnabled()) { - LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + - baseURL + "'", ex); + LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + + baseURL + "'", ex); } continue; } @@ -237,12 +253,14 @@ public class JSParseFilter implements Ht } catch (Exception ex) { // if it is a malformed URL we just throw it away and continue with // extraction. - if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); } + if (LOG.isErrorEnabled()) { + LOG.error("getJSLinks", ex); + } } final Outlink[] retval; - //create array of the Outlinks + // create array of the Outlinks if (outlinks != null && outlinks.size() > 0) { retval = (Outlink[]) outlinks.toArray(new Outlink[0]); } else { @@ -251,7 +269,7 @@ public class JSParseFilter implements Ht return retval; } - + public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println(JSParseFilter.class.getName() + " file.js baseURL"); @@ -261,10 +279,10 @@ public class JSParseFilter implements Ht BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); StringBuffer sb = new StringBuffer(); String line = null; - while ((line = br.readLine()) != null) + while ((line = br.readLine()) != null) sb.append(line + "\n"); br.close(); - + JSParseFilter parseFilter = new JSParseFilter(); parseFilter.setConf(NutchConfiguration.create()); Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]); Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java (original) +++ 
nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * from JavaScript files and embedded JavaScript code snippets. */ package org.apache.nutch.parse.js; + Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java (original) +++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java Thu Jan 29 05:38:59 2015 @@ -21,3 +21,4 @@ * (see {@link org.apache.nutch.indexer.metadata}). */ package org.apache.nutch.parse.metatags; + Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original) +++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Thu Jan 29 05:38:59 2015 @@ -44,11 +44,13 @@ import com.anotherbigidea.io.InStream; * distribution. 
*/ public class SWFParser implements Parser { - public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.swf"); + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.swf"); private Configuration conf = null; - public SWFParser() {} + public SWFParser() { + } public void setConf(Configuration conf) { this.conf = conf; @@ -68,10 +70,12 @@ public class SWFParser implements Parser byte[] raw = content.getContent(); String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); - if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, - "Content truncated at " + raw.length + - " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf()); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + + " bytes. 
Parser can't handle incomplete files.") + .getEmptyParseResult(content.getUrl(), getConf()); } ExtractText extractor = new ExtractText(); @@ -87,7 +91,8 @@ public class SWFParser implements Parser reader.readFile(); text = extractor.getText(); String atext = extractor.getActionText(); - if (atext != null && atext.length() > 0) text += "\n--------\n" + atext; + if (atext != null && atext.length() > 0) + text += "\n--------\n" + atext; // harvest potential outlinks String[] links = extractor.getUrls(); for (int i = 0; i < links.length; i++) { @@ -95,19 +100,25 @@ public class SWFParser implements Parser outlinks.add(out); } Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf); - if (olinks != null) for (int i = 0; i < olinks.length; i++) { - outlinks.add(olinks[i]); - } + if (olinks != null) + for (int i = 0; i < olinks.length; i++) { + outlinks.add(olinks[i]); + } } catch (Exception e) { // run time exception LOG.error("Error, runtime exception: ", e); - return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf()); - } - if (text == null) text = ""; + return new ParseStatus(ParseStatus.FAILED, + "Can't be handled as SWF document. 
" + e).getEmptyParseResult( + content.getUrl(), getConf()); + } + if (text == null) + text = ""; - Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]); + Outlink[] links = (Outlink[]) outlinks + .toArray(new Outlink[outlinks.size()]); ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, - content.getMetadata()); - return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); + content.getMetadata()); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, + parseData)); } /** @@ -120,10 +131,9 @@ public class SWFParser implements Parser in.read(buf); in.close(); SWFParser parser = new SWFParser(); - ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0], - buf, "application/x-shockwave-flash", - new Metadata(), - NutchConfiguration.create())); + ParseResult parseResult = parser.getParse(new Content("file:" + args[0], + "file:" + args[0], buf, "application/x-shockwave-flash", + new Metadata(), NutchConfiguration.create())); Parse p = parseResult.get("file:" + args[0]); System.out.println("Parse Text:"); System.out.println(p.getText()); @@ -168,7 +178,8 @@ class ExtractText extends SWFTagTypesImp StringBuffer res = new StringBuffer(); Iterator<String> it = strings.iterator(); while (it.hasNext()) { - if (res.length() > 0) res.append(' '); + if (res.length() > 0) + res.append(' '); res.append(it.next()); } return res.toString(); @@ -176,10 +187,12 @@ class ExtractText extends SWFTagTypesImp public String getActionText() { StringBuffer res = new StringBuffer(); - String[] strings = (String[])actionStrings.toArray(new String[actionStrings.size()]); + String[] strings = (String[]) actionStrings + .toArray(new String[actionStrings.size()]); Arrays.sort(strings); for (int i = 0; i < strings.length; i++) { - if (i > 0) res.append('\n'); + if (i > 0) + res.append('\n'); res.append(strings[i]); } return res.toString(); @@ -196,14 +209,16 @@ class 
ExtractText extends SWFTagTypesImp return res; } - public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, int arg4) throws IOException { + public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, + int arg4) throws IOException { tagDefineFontInfo(arg0, arg1, arg2, arg3); } /** * SWFTagTypes interface Save the Text Font character code info */ - public void tagDefineFontInfo(int fontId, String fontName, int flags, int[] codes) throws IOException { + public void tagDefineFontInfo(int fontId, String fontName, int flags, + int[] codes) throws IOException { // System.out.println("-defineFontInfo id=" + fontId + ", name=" + // fontName); fontCodes.put(new Integer(fontId), codes); @@ -213,16 +228,16 @@ class ExtractText extends SWFTagTypesImp // XXX codes anyway, so we just give up. /* * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException { - * return null; - * } + * return null; } */ /** * SWFTagTypes interface. Save the character code info. */ - public SWFVectors tagDefineFont2(int id, int flags, String name, int numGlyphs, int ascent, int descent, int leading, - int[] codes, int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, int[] kernAdjustments) - throws IOException { + public SWFVectors tagDefineFont2(int id, int flags, String name, + int numGlyphs, int ascent, int descent, int leading, int[] codes, + int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, + int[] kernAdjustments) throws IOException { // System.out.println("-defineFontInfo id=" + id + ", name=" + name); fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]); @@ -232,9 +247,10 @@ class ExtractText extends SWFTagTypesImp /** * SWFTagTypes interface. Dump any initial text in the field. 
*/ - public void tagDefineTextField(int fieldId, String fieldName, String initialText, Rect boundary, int flags, - AlphaColor textColor, int alignment, int fontId, int fontSize, int charLimit, int leftMargin, - int rightMargin, int indentation, int lineSpacing) throws IOException { + public void tagDefineTextField(int fieldId, String fieldName, + String initialText, Rect boundary, int flags, AlphaColor textColor, + int alignment, int fontId, int fontSize, int charLimit, int leftMargin, + int rightMargin, int indentation, int lineSpacing) throws IOException { if (initialText != null) { strings.add(initialText); } @@ -243,7 +259,8 @@ class ExtractText extends SWFTagTypesImp /** * SWFTagTypes interface */ - public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) throws IOException { + public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) + throws IOException { lastBounds = curBounds; curBounds = bounds; return new TextDumper(); @@ -255,7 +272,8 @@ class ExtractText extends SWFTagTypesImp /** * SWFTagTypes interface */ - public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) throws IOException { + public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) + throws IOException { lastBounds = curBounds; curBounds = bounds; return new TextDumper(); @@ -273,15 +291,16 @@ class ExtractText extends SWFTagTypesImp public void setY(int y) { if (firstY) firstY = false; - else strings.add("\n"); // Change in Y - dump a new line + else + strings.add("\n"); // Change in Y - dump a new line } /* * There are some issues with this method: sometimes SWF files define their - * own font, so short of OCR we cannot guess what is the glyph code -> character - * mapping. Additionally, some files don't use literal space character, instead - * they adjust glyphAdvances. We don't handle it at all - in such cases the text - * will be all glued together. + * own font, so short of OCR we cannot guess what is the glyph code -> + * character mapping. 
Additionally, some files don't use literal space + * character, instead they adjust glyphAdvances. We don't handle it at all - + * in such cases the text will be all glued together. */ public void text(int[] glyphIndices, int[] glyphAdvances) { // System.out.println("-text id=" + fontId); @@ -310,9 +329,11 @@ class ExtractText extends SWFTagTypesImp strings.add(new String(chars)); } - public void color(Color color) {} + public void color(Color color) { + } - public void setX(int x) {} + public void setX(int x) { + } public void done() { strings.add("\n"); @@ -367,7 +388,8 @@ class NutchSWFActions extends SWFActionB public void lookupTable(String[] values) throws IOException { for (int i = 0; i < values.length; i++) { - if (!strings.contains(values[i])) strings.add(values[i]); + if (!strings.contains(values[i])) + strings.add(values[i]); } super.lookupTable(values); dict = values; @@ -379,7 +401,7 @@ class NutchSWFActions extends SWFActionB } public void getURL(int vars, int mode) { - // System.out.println("-getURL: vars=" + vars + ", mode=" + mode); + // System.out.println("-getURL: vars=" + vars + ", mode=" + mode); } public void getURL(String url, String target) throws IOException { @@ -444,7 +466,8 @@ class NutchSWFActions extends SWFActionB super.setTarget(var); } - public SWFActionBlock startFunction(String var, String[] params) throws IOException { + public SWFActionBlock startFunction(String var, String[] params) + throws IOException { stack.push(var); strings.remove(var); if (params != null) { @@ -455,7 +478,8 @@ class NutchSWFActions extends SWFActionB return this; } - public SWFActionBlock startFunction2(String var, int arg1, int arg2, String[] params, int[] arg3) throws IOException { + public SWFActionBlock startFunction2(String var, int arg1, int arg2, + String[] params, int[] arg3) throws IOException { stack.push(var); strings.remove(var); if (params != null) { @@ -655,6 +679,7 @@ class SmallStack extends Stack<Object> { // tolerate underruns if 
(this.size() == 0) return null; - else return super.pop(); + else + return super.pop(); } } Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java (original) +++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Parse Flash SWF files. */ package org.apache.nutch.parse.swf; + Modified: nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original) +++ nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Thu Jan 29 05:38:59 2015 @@ -34,17 +34,19 @@ import org.apache.nutch.util.NutchConfig import org.junit.Assert; import org.junit.Test; -/** +/** * Unit tests for SWFParser. 
*/ public class TestSWFParser { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); - private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"}; - private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"}; + private String[] sampleFiles = new String[] { "test1.swf", "test2.swf", + "test3.swf" }; + private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", + "test3.txt" }; @Test public void testIt() throws ProtocolException, ParseException { @@ -58,7 +60,8 @@ public class TestSWFParser { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); parse = new ParseUtil(conf).parse(content).get(content.getUrl()); @@ -67,11 +70,12 @@ public class TestSWFParser { } } - public TestSWFParser() { + public TestSWFParser() { for (int i = 0; i < sampleFiles.length; i++) { try { // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleTexts[i]); + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleTexts[i]); StringBuffer sb = new StringBuffer(); int len = 0; InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
