Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java Fri Jan 9 06:34:33 2015 @@ -46,377 +46,342 @@ import java.util.StringTokenizer; */ public class DOMContentUtilsTest { - private static final String[] testPages = { - // 0. - new String( - "<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" + "</body></html>"), - // 1. - new String( - "<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" - + " home </a><!--comment-->" - + "<style> style </style>" - + " <a href=\"bot.html\">" + " bots </a>" - + "</body></html>"), - // 2. - new String("<html><head><title> </title>" + "</head><body> " - + "<a href=\"/\"> separate this " - + "<a href=\"ok\"> from this" + "</a></a>" - + "</body></html>"), - // 3. - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " + "<ul>" - + "<li> <a href=\"/\"> home" + "<li> <a href=\"1\"> 1" - + "<li> <a href=\"2\"> 2" + "</ul>" + "</body></html>"), - // 4. - // test frameset link extraction. The invalid frame in the middle - // will be - // fixed to a third standalone frame. - new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " - + "<frame src=\"top.html\">" + "</frame>" - + "<frameset cols=\"20,*\">" + "<frame src=\"left.html\">" - + "</frame>" + "<frame src=\"invalid.html\"/>" + "</frame>" - + "<frame src=\"right.html\">" + "</frame>" + "</frameset>" - + "</frameset>" + "</body></html>"), - // 5. - // test <area> and <iframe> link extraction + url normalization - new String( - "<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" - + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" - + "</body></html>"), - // 6. - // test whitespace processing for plain text extraction - new String( - "<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." - + "</body> </html>"), - // 7. - // test that <a rel=nofollow> links are not returned - new String( - "<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // 8. - // test that POST form actions are skipped - new String( - "<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // 9. - // test that all form actions are skipped - new String( - "<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // 10. - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\";x\">anchor1</a>" - + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), - // 11. - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\">anchor1</a>" - + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" - + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" - + "</body></html>"), }; - - private static int SKIP = 9; - - private static String[] testBaseHrefs = { "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", "http://www.nutch.org/", - "http://www.nutch.org/", "http://www.nutch.org/", - "http://www.nutch.org/;something" }; - - private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs = new URL[testPages.length]; - - private static final String[] answerText = { - "body anchor", - "body home bots", - "separate this from this", - "body home 1 2", - "", - "the bottom", - "Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", "ignore ignore", - "test1 test2", "test1 test2", "anchor1 anchor2 anchor3", - "anchor1 anchor2 anchor3 anchor4 anchor5" }; - - private static final String[] answerTitle = { "title", "title", "", - "my title", "my title", "my title", "my title", "", "", "", - "title", "title" }; - - // note: should be in page-order - private static Outlink[][] answerOutlinks; - - private static Configuration conf; - private static DOMContentUtils utils = null; - - public static final Logger Logger = LoggerFactory.getLogger(DOMContentUtilsTest.class); - - public DOMContentUtilsTest(String name) { - } - - private static void setup() throws Exception { - conf = NutchConfiguration.create(); - conf.setBoolean("parser.html.form.use_action", true); - utils = new DOMContentUtils(conf); - TikaParser tikaParser = new TikaParser(); - tikaParser.setConf(conf); - Parser parser = tikaParser.getTikaConfig().getParser("text/html"); - for (int i = 0; i < testPages.length; i++) { - Metadata tikamd = new Metadata(); - - HTMLDocumentImpl doc = new HTMLDocumentImpl(); - doc.setErrorChecking(false); - DocumentFragment root = doc.createDocumentFragment(); - DOMBuilder domhandler = new DOMBuilder(doc, root); - ParseContext context = new ParseContext(); - // to add once available in Tika - //context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); - try { - parser.parse(new ByteArrayInputStream(testPages[i].getBytes()), - domhandler, tikamd, context); - testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); - } catch (Exception e) { - e.printStackTrace(); - fail("caught exception: " + e); - } - testDOMs[i] = root; - LSSerializerImpl lsi = new LSSerializerImpl(); - System.out.println("input " + i + ": '" + testPages[i] + "'"); - System.out.println("output " + i + ": '" + lsi.writeToString(root) - + "'"); - - } - answerOutlinks = new Outlink[][] { - // 0 - { new Outlink("http://www.nutch.org", "anchor"), }, - // 1 - { - new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", - "bots"), }, - // 2 - { - new Outlink("http://www.nutch.org/", "separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), }, - - // 3 - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), }, - // 4 - { - new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html",""), - new Outlink("http://www.nutch.org/frames/right.html",""), - }, - // 5 - { - new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", "") - }, - // 6 - { new Outlink("http://www.nutch.org/index.html", - "whitespace test"), - }, - // 7 - {}, - // 8 - { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, - // 9 - {}, - // 10 - { - new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") - }, - // 11 - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g","anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") } - }; - - } - - private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1 = new StringTokenizer(s1); - StringTokenizer st2 = new StringTokenizer(s2); - - while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) { - Logger.info("st1+ '" + st1.nextToken() + "'"); - return false; - } - String st1Token = st1.nextToken(); - String st2Token = st2.nextToken(); - if (!st1Token.equals(st2Token)) { - Logger.info("st1:'" + st1Token + "' != st2:'" + st2Token + "'"); - return false; - } - } - if (st2.hasMoreTokens()) { - System.err.println("st2+ '" + st2.nextToken() + "'"); - return false; - } - return true; - } - - @Test - public void testGetText() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getText(sb, testDOMs[i]); - String text = sb.toString(); - assertTrue( - "example " + i + " : expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: " + text, - equalsIgnoreWhitespace(answerText[i], text)); - } - } - - // won't work with Tika - the title is stored in the metadata but - // not put in the XHTML representation - @Test - public void testGetTitle() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - StringBuffer sb = new StringBuffer(); - utils.getTitle(sb, testDOMs[i]); - String title = sb.toString(); - assertTrue( - "example " + i + " : expecting title: " + answerTitle[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got title: " + title, - equalsIgnoreWhitespace(answerTitle[i], title)); - } - } - - @Test - public void testGetOutlinks() throws Exception { - if (testDOMs[0] == null) - setup(); - for (int i = 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - if (i == SKIP) { - conf.setBoolean("parser.html.form.use_action", false); - utils.setConf(conf); - } else { - conf.setBoolean("parser.html.form.use_action", true); - utils.setConf(conf); - } - utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr = new Outlink[outlinks.size()]; - outlinkArr = outlinks.toArray(outlinkArr); - compareOutlinks(i, answerOutlinks[i], outlinkArr); - } - } - - private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i = 0; i < o.length; i++) { - sb.append(o[i].toString()); - sb.append(System.getProperty("line.separator")); - } - } - - private static final String outlinksString(Outlink[] o) { - StringBuffer sb = new StringBuffer(); - appendOutlinks(sb, o); - return sb.toString(); - } - - private static final void compareOutlinks(int test, Outlink[] o1, - Outlink[] o2) { - if (o1.length != o2.length) { - assertTrue( - "test " + test - + ", got wrong number of outlinks (expecting " - + o1.length + ", got " + o2.length + ")" - + System.getProperty("line.separator") + "answer: " - + System.getProperty("line.separator") - + outlinksString(o1) - + System.getProperty("line.separator") + "got: " - + System.getProperty("line.separator") - + outlinksString(o2) - + System.getProperty("line.separator"), false); - } - - for (int i = 0; i < o1.length; i++) { - if (!o1[i].equals(o2[i])) { - assertTrue( - "test " + test + ", got wrong outlinks at position " - + i + System.getProperty("line.separator") - + "answer: " - + System.getProperty("line.separator") - + o1[i].toString() - + System.getProperty("line.separator") - + "got: " - + System.getProperty("line.separator") - + o2[i].toString(), false); - - } - } - } + private static final String[] testPages = { + // 0. + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + // 1. + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + // 2. + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + // 3. + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + // 4. + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. + new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "</frame>" + + "<frame src=\"invalid.html\"/>" + "</frame>" + + "<frame src=\"right.html\">" + "</frame>" + "</frameset>" + + "</frameset>" + "</body></html>"), + // 5. + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + // 6. + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." + "</body> </html>"), + // 7. + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // 8. + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // 9. + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // 10. + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + // 11. + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + + private static int SKIP = 9; + + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new URL[testPages.length]; + + private static final String[] answerText = { + "body anchor", + "body home bots", + "separate this from this", + "body home 1 2", + "", + "the bottom", + "Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "anchor1 anchor2 anchor3", + "anchor1 anchor2 anchor3 anchor4 anchor5" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; + + // note: should be in page-order + private static Outlink[][] answerOutlinks; + + private static Configuration conf; + private static DOMContentUtils utils = null; + + public static final Logger Logger = LoggerFactory + .getLogger(DOMContentUtilsTest.class); + + public DOMContentUtilsTest(String name) { + } + + private static void setup() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean("parser.html.form.use_action", true); + utils = new DOMContentUtils(conf); + TikaParser tikaParser = new TikaParser(); + tikaParser.setConf(conf); + Parser parser = tikaParser.getTikaConfig().getParser("text/html"); + for (int i = 0; i < testPages.length; i++) { + Metadata tikamd = new Metadata(); + + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment root = doc.createDocumentFragment(); + DOMBuilder domhandler = new DOMBuilder(doc, root); + ParseContext context = new ParseContext(); + // to add once available in Tika + // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); + try { + parser.parse(new ByteArrayInputStream(testPages[i].getBytes()), + domhandler, tikamd, context); + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); + } catch (Exception e) { + e.printStackTrace(); + fail("caught exception: " + e); + } + testDOMs[i] = root; + LSSerializerImpl lsi = new LSSerializerImpl(); + System.out.println("input " + i + ": '" + testPages[i] + "'"); + System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'"); + + } + answerOutlinks = new Outlink[][] { + // 0 + { new Outlink("http://www.nutch.org", "anchor"), }, + // 1 + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + // 2 + { new Outlink("http://www.nutch.org/", "separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + + // 3 + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + // 4 + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + // 5 + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", "") }, + // 6 + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + // 7 + {}, + // 8 + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + // 9 + {}, + // 10 + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + // 11 + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; + + } + + private static boolean equalsIgnoreWhitespace(String s1, String s2) { + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); + + while (st1.hasMoreTokens()) { + if (!st2.hasMoreTokens()) { + Logger.info("st1+ '" + st1.nextToken() + "'"); + return false; + } + String st1Token = st1.nextToken(); + String st2Token = st2.nextToken(); + if (!st1Token.equals(st2Token)) { + Logger.info("st1:'" + st1Token + "' != st2:'" + st2Token + "'"); + return false; + } + } + if (st2.hasMoreTokens()) { + System.err.println("st2+ '" + st2.nextToken() + "'"); + return false; + } + return true; + } + + @Test + public void testGetText() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getText(sb, testDOMs[i]); + String text = sb.toString(); + assertTrue( + "example " + i + " : expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerText[i], text)); + } + } + + // won't work with Tika - the title is stored in the metadata but + // not put in the XHTML representation + @Test + public void testGetTitle() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + StringBuffer sb = new StringBuffer(); + utils.getTitle(sb, testDOMs[i]); + String title = sb.toString(); + assertTrue( + "example " + i + " : expecting title: " + answerTitle[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got title: " + title, + equalsIgnoreWhitespace(answerTitle[i], title)); + } + } + + @Test + public void testGetOutlinks() throws Exception { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + if (i == SKIP) { + conf.setBoolean("parser.html.form.use_action", false); + utils.setConf(conf); + } else { + conf.setBoolean("parser.html.form.use_action", true); + utils.setConf(conf); + } + utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = outlinks.toArray(outlinkArr); + compareOutlinks(i, answerOutlinks[i], outlinkArr); + } + } + + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { + for (int i = 0; i < o.length; i++) { + sb.append(o[i].toString()); + sb.append(System.getProperty("line.separator")); + } + } + + private static final String outlinksString(Outlink[] o) { + StringBuffer sb = new StringBuffer(); + appendOutlinks(sb, o); + return sb.toString(); + } + + private static final void compareOutlinks(int test, Outlink[] o1, Outlink[] o2) { + if (o1.length != o2.length) { + assertTrue( + "test " + test + ", got wrong number of outlinks (expecting " + + o1.length + ", got " + o2.length + ")" + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + outlinksString(o1) + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + outlinksString(o2) + + System.getProperty("line.separator"), false); + } + + for (int i = 0; i < o1.length; i++) { + if (!o1[i].equals(o2[i])) { + assertTrue( + "test " + test + ", got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + o1[i].toString() + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + o2[i].toString(), + false); + + } + } + } }
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java Fri Jan 9 06:34:33 2015 @@ -39,9 +39,9 @@ public class TestImageMetadata { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in - private String[] sampleFiles = {"nutch_logo_tm.gif"}; + private String[] sampleFiles = { "nutch_logo_tm.gif" }; @Test public void testIt() throws ProtocolException, ParseException, IOException { @@ -50,32 +50,32 @@ public class TestImageMetadata { Parse parse; Configuration conf = NutchConfiguration.create(); MimeUtil mimeutil = new MimeUtil(conf); - + for (int i = 0; i < sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - + File file = new File(sampleDir + fileSeparator + sampleFiles[i]); byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); in.close(); - + WebPage page = WebPage.newBuilder().build(); page.setBaseUrl(new Utf8(urlString)); page.setContent(ByteBuffer.wrap(bytes)); String mtype = mimeutil.getMimeType(file); page.setContentType(new Utf8(mtype)); - + parse = new ParseUtil(conf).parse(urlString, page); - - //assert width + + // assert width ByteBuffer bbufW = page.getMetadata().get(new Utf8("width")); byte[] byteArrayW = new byte[bbufW.remaining()]; bbufW.get(byteArrayW); String width = new String(byteArrayW); assertEquals("121", width); - - //assert height + + // assert height ByteBuffer bbufH = page.getMetadata().get(new Utf8("height")); byte[] byteArrayH = new byte[bbufH.remaining()]; bbufH.get(byteArrayH); Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java Fri Jan 9 06:34:33 2015 @@ -44,63 +44,63 @@ import static org.junit.Assert.assertTru */ public class TestMSWordParser { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-msword/build.xml during plugin compilation. - // Check ./src/plugin/parse-msword/sample/README.txt for what they are. - private String[] sampleFiles = { "word97.doc" }; - - private String expectedText = "This is a sample doc file prepared for nutch."; - - private Configuration conf; - - @Before - public void setUp() { - conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-msword/build.xml during plugin compilation. + // Check ./src/plugin/parse-msword/sample/README.txt for what they are. + private String[] sampleFiles = { "word97.doc" }; + + private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + public String getTextContent(String fileName) throws ProtocolException, + ParseException, IOException { + String urlString = sampleDir + fileSeparator + fileName; + + File file = new File(urlString); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + in.close(); + Parse parse; + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8("file:" + urlString)); + page.setContent(ByteBuffer.wrap(bytes)); + // set the content type? + MimeUtil mimeutil = new MimeUtil(conf); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); + + parse = new ParseUtil(conf).parse("file:" + urlString, page); + return parse.getText(); + } + + @Test + public void testIt() throws ProtocolException, ParseException, IOException { + for (int i = 0; i < sampleFiles.length; i++) { + String found = getTextContent(sampleFiles[i]); + assertTrue("text found : '" + found + "'", found.startsWith(expectedText)); } + } - public String getTextContent(String fileName) throws ProtocolException, - ParseException, IOException { - String urlString = sampleDir + fileSeparator + fileName; - - File file = new File(urlString); - byte[] bytes = new byte[(int) file.length()]; - DataInputStream in = new DataInputStream(new FileInputStream(file)); - in.readFully(bytes); - in.close(); - Parse parse; - WebPage page = WebPage.newBuilder().build(); - page.setBaseUrl(new Utf8("file:"+urlString)); - page.setContent(ByteBuffer.wrap(bytes)); - // set the content type? - MimeUtil mimeutil = new MimeUtil(conf); - String mtype = mimeutil.getMimeType(file); - page.setContentType(new Utf8(mtype)); - - parse = new ParseUtil(conf).parse("file:"+urlString, page); - return parse.getText(); - } - - @Test - public void testIt() throws ProtocolException, ParseException, IOException { - for (int i = 0; i < sampleFiles.length; i++) { - String found = getTextContent(sampleFiles[i]); - assertTrue("text found : '" + found + "'", found - .startsWith(expectedText)); - } - } - - @Test - public void testOpeningDocs() throws ProtocolException, ParseException, IOException { - String[] filenames = new File(sampleDir).list(); - for (int i = 0; i < filenames.length; i++) { - if (filenames[i].endsWith(".doc") == false) - continue; - assertTrue("cann't read content of " + filenames[i], - getTextContent(filenames[i]).length() > 0); - } + @Test + public void testOpeningDocs() throws ProtocolException, ParseException, + IOException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".doc") == false) + continue; + assertTrue("cann't read content of " + filenames[i], + getTextContent(filenames[i]).length() > 0); } + } } Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java Fri Jan 9 06:34:33 2015 @@ -57,11 +57,11 @@ public class TestOOParser { Parse parse; Configuration conf = NutchConfiguration.create(); MimeUtil mimeutil = new MimeUtil(conf); - + try { // read the test string FileInputStream fis = new FileInputStream(sampleDir + fileSeparator - + sampleText); + + sampleText); StringBuffer sb = new StringBuffer(); int len = 0; InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); @@ -83,7 +83,7 @@ public class TestOOParser { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; if (sampleFiles[i].startsWith("ootest") == false) - continue; + continue; File file = new File(sampleDir + fileSeparator + sampleFiles[i]); byte[] bytes = new byte[(int) file.length()]; Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java Fri Jan 9 06:34:33 2015 @@ -43,43 +43,43 @@ import static org.junit.Assert.assertTru */ public class TestPdfParser { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-pdf/build.xml during plugin compilation. - // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. - private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; - - private String expectedText = "A VERY SMALL PDF FILE"; - - @Test - public void testIt() throws ProtocolException, ParseException, IOException { - String urlString; - Parse parse; - Configuration conf = NutchConfiguration.create(); - MimeUtil mimeutil = new MimeUtil(conf); - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - File file = new File(sampleDir + fileSeparator + sampleFiles[i]); - byte[] bytes = new byte[(int) file.length()]; - DataInputStream in = new DataInputStream(new FileInputStream(file)); - in.readFully(bytes); - in.close(); - - WebPage page = WebPage.newBuilder().build(); - page.setBaseUrl(new Utf8(urlString)); - page.setContent(ByteBuffer.wrap(bytes)); - String mtype = mimeutil.getMimeType(file); - page.setContentType(new Utf8(mtype)); - - parse = new ParseUtil(conf).parse(urlString, page); - - int index = parse.getText().indexOf(expectedText); - assertTrue(index > 0); - } + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-pdf/build.xml during plugin compilation. + // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. + private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; + + private String expectedText = "A VERY SMALL PDF FILE"; + + @Test + public void testIt() throws ProtocolException, ParseException, IOException { + String urlString; + Parse parse; + Configuration conf = NutchConfiguration.create(); + MimeUtil mimeutil = new MimeUtil(conf); + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + File file = new File(sampleDir + fileSeparator + sampleFiles[i]); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + in.close(); + + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8(urlString)); + page.setContent(ByteBuffer.wrap(bytes)); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); + + parse = new ParseUtil(conf).parse(urlString, page); + + int index = parse.getText().indexOf(expectedText); + assertTrue(index > 0); } + } } Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java Fri Jan 9 06:34:33 2015 @@ -68,7 +68,7 @@ public class TestRSSParser { * </ul> */ @Test - public void testIt()throws ProtocolException, ParseException, IOException { + public void testIt() throws ProtocolException, ParseException, IOException { String urlString; Parse parse; Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Fri Jan 9 06:34:33 2015 @@ -44,50 +44,49 @@ import static org.junit.Assert.assertEqu */ public class TestRTFParser { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-rtf/build.xml during plugin compilation. - // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. - private String rtfFile = "test.rtf"; - - @Test - public void testIt() throws ProtocolException, ParseException, IOException { - - String urlString; - Parse parse; - Configuration conf = NutchConfiguration.create(); - MimeUtil mimeutil = new MimeUtil(conf); - - urlString = "file:" + sampleDir + fileSeparator + rtfFile; - - File file = new File(sampleDir + fileSeparator + rtfFile); - byte[] bytes = new byte[(int) file.length()]; - DataInputStream in = new DataInputStream(new FileInputStream(file)); - in.readFully(bytes); - in.close(); - - WebPage page = WebPage.newBuilder().build(); - page.setBaseUrl(new Utf8(urlString)); - page.setContent(ByteBuffer.wrap(bytes)); - String mtype = mimeutil.getMimeType(file); - page.setContentType(new Utf8(mtype)); - - parse = new ParseUtil(conf).parse(urlString, page); - - String title = parse.getTitle(); - String text = parse.getText(); - assertEquals("test rft document", title); - //assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); - - - // HOW DO WE GET THE PARSE METADATA? - // Metadata meta = parse(); - - // METADATA extraction is not yet supported in Tika - // - // assertEquals("tests", meta.get(DublinCore.SUBJECT)); - } + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-rtf/build.xml during plugin compilation. + // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. + private String rtfFile = "test.rtf"; + + @Test + public void testIt() throws ProtocolException, ParseException, IOException { + + String urlString; + Parse parse; + Configuration conf = NutchConfiguration.create(); + MimeUtil mimeutil = new MimeUtil(conf); + + urlString = "file:" + sampleDir + fileSeparator + rtfFile; + + File file = new File(sampleDir + fileSeparator + rtfFile); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + in.close(); + + WebPage page = WebPage.newBuilder().build(); + page.setBaseUrl(new Utf8(urlString)); + page.setContent(ByteBuffer.wrap(bytes)); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); + + parse = new ParseUtil(conf).parse(urlString, page); + + String title = parse.getTitle(); + String text = parse.getText(); + assertEquals("test rft document", title); + // assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); + + // HOW DO WE GET THE PARSE METADATA? + // Metadata meta = parse(); + + // METADATA extraction is not yet supported in Tika + // + // assertEquals("tests", meta.get(DublinCore.SUBJECT)); + } } Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Fri Jan 9 06:34:33 2015 @@ -32,10 +32,11 @@ import java.util.Collection; import java.util.HashSet; /** - * This class is a protocol plugin used for file: scheme. - * It creates {@link FileResponse} object and gets the content of the url from it. - * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} - * in nutch-default.xml defined under "file properties" section. + * This class is a protocol plugin used for file: scheme. It creates + * {@link FileResponse} object and gets the content of the url from it. + * Configurable parameters are {@code file.content.limit} and + * {@code file.crawl.parent} in nutch-default.xml defined under + * "file properties" section. */ public class File implements Protocol { @@ -51,7 +52,7 @@ public class File implements Protocol { static final int MAX_REDIRECTS = 5; int maxContentLength; - + boolean crawlParents; /** @@ -63,7 +64,8 @@ public class File implements Protocol { private Configuration conf; // constructor - public File() { } + public File() { + } /** * Set the {@link Configuration} object @@ -75,29 +77,32 @@ public class File implements Protocol { this.symlinksAsRedirects = conf.getBoolean( "file.crawl.redirect_noncanonical", true); } - + /** * Get the {@link Configuration} object */ public Configuration getConf() { return this.conf; } - - /** - * Set the point at which content is truncated. + + /** + * Set the point at which content is truncated. */ public void setMaxContentLength(int maxContentLength) { this.maxContentLength = maxContentLength; } - - /** - * Creates a {@link FileResponse} object corresponding to the url and - * return a {@link ProtocolOutput} object as per the content received + + /** + * Creates a {@link FileResponse} object corresponding to the url and return a + * {@link ProtocolOutput} object as per the content received * - * @param url Text containing the url - * @param datum The CrawlDatum object corresponding to the url + * @param url + * Text containing the url + * @param datum + * The CrawlDatum object corresponding to the url * - * @return {@link ProtocolOutput} object for the content of the file indicated by url + * @return {@link ProtocolOutput} object for the content of the file indicated + * by url */ public ProtocolOutput getProtocolOutput(String url, WebPage page) { String urlString = url.toString(); @@ -115,13 +120,16 @@ public class File implements Protocol { return new ProtocolOutput(response.toContent()); // return it } else if (code == 304) { // got not modified - return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTMODIFIED); + return new ProtocolOutput(response.toContent(), + ProtocolStatusUtils.STATUS_NOTMODIFIED); } else if (code == 401) { // access denied / no read permissions - return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED)); + return new ProtocolOutput(response.toContent(), + ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED)); } else if (code == 404) { // no such file - return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTFOUND); + return new ProtocolOutput(response.toContent(), + ProtocolStatusUtils.STATUS_NOTFOUND); } else if (code >= 300 && code < 400) { // handle redirect u = new URL(response.getHeader("Location")); @@ -156,8 +164,8 @@ public class File implements Protocol { return FIELDS; } - /** - * Quick way for running this class. Useful for debugging. + /** + * Quick way for running this class. Useful for debugging. */ public static void main(String[] args) throws Exception { int maxContentLength = Integer.MIN_VALUE; @@ -216,11 +224,11 @@ public class File implements Protocol { file = null; } - /** - * No robots parsing is done for file protocol. - * So this returns a set of empty rules which will allow every url. + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. */ public BaseRobotRules getRobotRules(String url, WebPage page) { return RobotRulesParser.EMPTY_RULES; - } + } } Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java (original) +++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java Fri Jan 9 06:34:33 2015 @@ -17,13 +17,16 @@ package org.apache.nutch.protocol.file; -/** Thrown for File error codes. +/** + * Thrown for File error codes. */ public class FileError extends FileException { private int code; - - public int getCode(int code) { return code; } + + public int getCode(int code) { + return code; + } public FileError(int code) { super("File Error: " + code); Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Fri Jan 9 06:34:33 2015 @@ -30,31 +30,27 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.MimeUtil; - /************************************ - * FileResponse.java mimics file replies as http response. - * It tries its best to follow http's way for headers, response codes - * as well as exceptions. - * - * Comments: - * (1) java.net.URL and java.net.URLConnection can handle file: scheme. - * However they are not flexible enough, so not used in this implementation. - * - * (2) java.io.File is used for its abstractness across platforms. - * Warning: - * java.io.File API (1.4.2) does not elaborate on how special files, - * such as /dev/* in unix and /proc/* on linux, are treated. Tests show - * (a) java.io.File.isFile() return false for /dev/* - * (b) java.io.File.isFile() return true for /proc/* - * (c) java.io.File.length() return 0 for /proc/* - * We are probably oaky for now. Could be buggy here. - * How about special files on windows? - * - * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. - * They are just treated as individual files. - * + * FileResponse.java mimics file replies as http response. It tries its best to + * follow http's way for headers, response codes as well as exceptions. + * + * Comments: (1) java.net.URL and java.net.URLConnection can handle file: + * scheme. However they are not flexible enough, so not used in this + * implementation. + * + * (2) java.io.File is used for its abstractness across platforms. Warning: + * java.io.File API (1.4.2) does not elaborate on how special files, such as + * /dev/* in unix and /proc/* on linux, are treated. Tests show (a) + * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile() + * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are + * probably oaky for now. Could be buggy here. How about special files on + * windows? + * + * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They + * are just treated as individual files. + * * (4) No funcy POSIX file attributes yet. May never need? - * + * * @author John Xing ***********************************/ public class FileResponse { @@ -68,33 +64,36 @@ public class FileResponse { private final File file; private Configuration conf; - + private MimeUtil MIME; /** Returns the response code. */ - public int getCode() { return code; } + public int getCode() { + return code; + } /** Returns the value of a named header. */ public String getHeader(String name) { return headers.get(name); } - public byte[] getContent() { return content; } + public byte[] getContent() { + return content; + } public Content toContent() { return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), - getHeader(Response.CONTENT_TYPE), - headers, this.conf); + getHeader(Response.CONTENT_TYPE), headers, this.conf); } - + public FileResponse(URL url, WebPage page, File file, Configuration conf) - throws FileException, IOException { + throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; - + MIME = new MimeUtil(conf); if (!"file".equals(url.getProtocol())) @@ -117,22 +116,22 @@ public class FileResponse { path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } - + try { this.content = null; // url.toURI() is only in j2se 1.5.0 - //java.io.File f = new java.io.File(url.toURI()); + // java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { - this.code = 404; // http Not Found + this.code = 404; // http Not Found return; } if (!f.canRead()) { - this.code = 401; // http Unauthorized + this.code = 401; // http Unauthorized return; } @@ -141,15 +140,17 @@ public class FileResponse { // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers - //hdrs.put("Location", f.getCanonicalFile().toURI()); - headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); + // hdrs.put("Location", f.getCanonicalFile().toURI()); + headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL() + .toString()); - this.code = 300; // http redirect + this.code = 300; // http redirect return; } if (f.lastModified() <= page.getModifiedTime()) { this.code = 304; - this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); + this.headers.set("Last-Modified", + HttpDateFormat.toString(f.lastModified())); return; } @@ -169,45 +170,46 @@ public class FileResponse { } // get file as http response - private void getFileAsHttpResponse(java.io.File f) - throws FileException, IOException { + private void getFileAsHttpResponse(java.io.File f) throws FileException, + IOException { // ignore file of size larger than // Integer.MAX_VALUE = 2^31-1 = 2147483647 long size = f.length(); if (size > Integer.MAX_VALUE) { - throw new FileException("file is too large, size: "+size); + throw new FileException("file is too large, size: " + size); // or we can do this? - // this.code = 400; // http Bad request + // this.code = 400; // http Bad request // return; } // capture content int len = (int) size; - + if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; java.io.InputStream is = new java.io.FileInputStream(f); - int offset = 0; int n = 0; + int offset = 0; + int n = 0; while (offset < len - && (n = is.read(this.content, offset, len-offset)) >= 0) { + && (n = is.read(this.content, offset, len - offset)) >= 0) { offset += n; } if (offset < len) { // keep whatever already have, but issue a warning if (File.LOG.isWarnEnabled()) { - File.LOG.warn("not enough bytes read from file: "+f.getPath()); + File.LOG.warn("not enough bytes read from file: " + f.getPath()); } } - is.close(); + is.close(); // set headers headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); - headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f - .lastModified())); - + headers.set(Response.LAST_MODIFIED, + HttpDateFormat.toString(f.lastModified())); + String mimeType = MIME.getMimeType(f); String mimeTypeString = mimeType != null ? mimeType.toString() : ""; headers.set(Response.CONTENT_TYPE, mimeTypeString); @@ -217,33 +219,33 @@ public class FileResponse { } // get dir list as http response - private void getDirAsHttpResponse(java.io.File f) - throws IOException { + private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) - this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); + this.content = list2html(f.listFiles(), path, "/".equals(path) ? false + : true); else - this.content = list2html(f.listFiles(), path, false); + this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, - new Integer(this.content.length).toString()); + new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, - HttpDateFormat.toString(f.lastModified())); + HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK } // generate html page from dir list - private byte[] list2html(java.io.File[] list, - String path, boolean includeDotDot) { + private byte[] list2html(java.io.File[] list, String path, + boolean includeDotDot) { StringBuffer x = new StringBuffer("<html><head>"); - x.append("<title>Index of "+path+"</title></head>\n"); - x.append("<body><h1>Index of "+path+"</h1><pre>\n"); + x.append("<title>Index of " + path + "</title></head>\n"); + x.append("<body><h1>Index of " + path + "</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); @@ -252,20 +254,20 @@ public class FileResponse { // fix me: we might want to sort list here! but not now. java.io.File f; - for (int i=0; i<list.length; i++) { + for (int i = 0; i < list.length; i++) { f = list[i]; String name = f.getName(); String time = HttpDateFormat.toString(f.lastModified()); if (f.isDirectory()) { // java 1.4.2 api says dir itself and parent dir are not listed // so the following is not needed. - //if (name.equals(".") || name.equals("..")) - // continue; - x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t"); - x.append(time+"\t-\n"); + // if (name.equals(".") || name.equals("..")) + // continue; + x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); + x.append(time + "\t-\n"); } else if (f.isFile()) { - x.append("<a href='"+name+ "'>"+name+"</a>\t"); - x.append(time+"\t"+f.length()+"\n"); + x.append("<a href='" + name + "'>" + name + "</a>\t"); + x.append(time + "\t" + f.length() + "\n"); } else { // ignore any other } Modified: nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original) +++ nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Fri Jan 9 06:34:33 2015 @@ -37,9 +37,10 @@ import static org.junit.Assert.*; * @author mattmann * @version $Revision$ * - * <p> - * Unit tests for the {@link File}Protocol. - * </p>. + * <p> + * Unit tests for the {@link File}Protocol. + * </p> + * . */ public class TestProtocolFile { @@ -47,12 +48,13 @@ public class TestProtocolFile { private String sampleDir = System.getProperty("test.data", "."); private static final String[] testTextFiles = new String[] { - "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" }; + "testprotocolfile.txt", "testprotocolfile_(encoded).txt", + "testprotocolfile_%28encoded%29.txt" }; private static final String expectedMimeType = "text/plain"; - + private Configuration conf; - + @Before public void setUp() { conf = NutchConfiguration.create(); @@ -64,11 +66,11 @@ public class TestProtocolFile { setContentType(testTextFile); } } - + /** - * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata - * field. - * @throws ProtocolNotFound + * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field. + * + * @throws ProtocolNotFound * * @since NUTCH-384 * @@ -78,19 +80,19 @@ public class TestProtocolFile { assertNotNull(urlString); WebPage datum = WebPage.newBuilder().build(); Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - ProtocolOutput output = protocol.getProtocolOutput(urlString,datum); + ProtocolOutput output = protocol.getProtocolOutput(urlString, datum); assertNotNull(output); assertEquals("Status code: [" + output.getStatus().getCode() + "], not equal to: [" + ProtocolStatusCodes.SUCCESS + "]: args: [" - + output.getStatus().getArgs() + "]", (Integer) ProtocolStatusCodes.SUCCESS, output - .getStatus().getCode()); + + output.getStatus().getArgs() + "]", + (Integer) ProtocolStatusCodes.SUCCESS, output.getStatus().getCode()); assertNotNull(output.getContent()); assertNotNull(output.getContent().getContentType()); assertEquals(expectedMimeType, output.getContent().getContentType()); assertNotNull(output.getContent().getMetadata()); - assertEquals(expectedMimeType, output.getContent().getMetadata().get( - Response.CONTENT_TYPE)); + assertEquals(expectedMimeType, + output.getContent().getMetadata().get(Response.CONTENT_TYPE)); }
