s...

lewismc Thu, 08 Jan 2015 22:35:05 -0800

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
 Fri Jan  9 06:34:33 2015
@@ -46,377 +46,342 @@ import java.util.StringTokenizer;
  */
 public class DOMContentUtilsTest {
 
-       private static final String[] testPages = {
-                       // 0.
-                       new String(
-                                       "<html><head><title> title 
</title><script> script </script>"
-                                                       + "</head><body> body 
<a href=\"http://www.nutch.org\">"
-                                                       + " anchor 
</a><!--comment-->" + "</body></html>"),
-                       // 1.
-                       new String(
-                                       "<html><head><title> title 
</title><script> script </script>"
-                                                       + "</head><body> body 
<a href=\"/\">"
-                                                       + " home 
</a><!--comment-->"
-                                                       + "<style> style 
</style>"
-                                                       + " <a 
href=\"bot.html\">" + " bots </a>"
-                                                       + "</body></html>"),
-                       // 2.
-                       new String("<html><head><title> </title>" + 
"</head><body> "
-                                       + "<a href=\"/\"> separate this "
-                                       + "<a href=\"ok\"> from this" + 
"</a></a>"
-                                       + "</body></html>"),
-                       // 3.
-                       // this one relies on certain neko fixup behavior, 
possibly
-                       // distributing the anchors into the LI's-but not the 
other
-                       // anchors (outside of them, instead)! So you get a 
tree that
-                       // looks like:
-                       // ... <li> <a href=/> home </a> </li>
-                       // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-                       // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> 
</a> </a> </li>
-                       new String("<html><head><title> my title </title>"
-                                       + "</head><body> body " + "<ul>"
-                                       + "<li> <a href=\"/\"> home" + "<li> <a 
href=\"1\"> 1"
-                                       + "<li> <a href=\"2\"> 2" + "</ul>" + 
"</body></html>"),
-                       // 4.
-                       // test frameset link extraction. The invalid frame in 
the middle
-                       // will be
-                       // fixed to a third standalone frame.
-                       new String("<html><head><title> my title </title>"
-                                       + "</head><frameset rows=\"20,*\"> "
-                                       + "<frame src=\"top.html\">" + 
"</frame>"
-                                       + "<frameset cols=\"20,*\">" + "<frame 
src=\"left.html\">"
-                                       + "</frame>" + "<frame 
src=\"invalid.html\"/>" + "</frame>"
-                                       + "<frame src=\"right.html\">" + 
"</frame>" + "</frameset>"
-                                       + "</frameset>" + "</body></html>"),
-                       // 5.
-                       // test <area> and <iframe> link extraction + url 
normalization
-                       new String(
-                                       "<html><head><title> my title </title>"
-                                                       + "</head><body>"
-                                                       + "<img 
src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-                                                       + "<map name=\"green\">"
-                                                       + "<area 
shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-                                                       + "<area shape=\"rect\" 
coords=\"128,132,241,179\" href=\"#bottom\">"
-                                                       + "<area 
shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-                                                       + "</map>"
-                                                       + "<a 
name=\"bottom\"/><h1> the bottom </h1> "
-                                                       + "<iframe 
src=\"../docs/index.html\"/>"
-                                                       + "</body></html>"),
-                       // 6.
-                       // test whitespace processing for plain text extraction
-                       new String(
-                                       "<html><head>\n <title> my\t\n  
title\r\n </title>\n"
-                                                       + " </head>\n"
-                                                       + " <body>\n"
-                                                       + "    <h1> 
Whitespace\ttest  </h1> \n"
-                                                       + "\t<a 
href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-                                                       + "    <p> This 
is<span> a whitespace<span></span> test</span>. Newlines\n"
-                                                       + "should appear as 
space too.</p><p>Tabs\tare spaces too.\n</p>"
-                                                       + "    This\t<b>is 
a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-                                                       + "<table>"
-                                                       + "    
<tr><td>one</td><td>two</td><td>three</td></tr>\n"
-                                                       + "    <tr><td>space 
here </td><td> space there</td><td>no space</td></tr>"
-                                                       + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-                                                       + "</table>put some 
text here<Br>and there."
-                                                       + 
"<h2>End\tthis\rmadness\n!</h2>\r\n"
-                                                       + "         .        .  
      .         ."
-                                                       + "</body>  </html>"),
-                       // 7.
-                       // test that <a rel=nofollow> links are not returned
-                       new String(
-                                       "<html><head></head><body>"
-                                                       + "<a 
href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-                                                       + "<a rel=\"nofollow\" 
href=\"http://www.nutch.org\"> ignore </a>"
-                                                       + "</body></html>"),
-                       // 8.
-                       // test that POST form actions are skipped
-                       new String(
-                                       "<html><head></head><body>"
-                                                       + "<form method='POST' 
action='/search.jsp'><input type=text>"
-                                                       + "<input 
type=submit><p>test1</p></form>"
-                                                       + "<form method='GET' 
action='/dummy.jsp'><input type=text>"
-                                                       + "<input 
type=submit><p>test2</p></form></body></html>"),
-                       // 9.
-                       // test that all form actions are skipped
-                       new String(
-                                       "<html><head></head><body>"
-                                                       + "<form method='POST' 
action='/search.jsp'><input type=text>"
-                                                       + "<input 
type=submit><p>test1</p></form>"
-                                                       + "<form method='GET' 
action='/dummy.jsp'><input type=text>"
-                                                       + "<input 
type=submit><p>test2</p></form></body></html>"),
-                       // 10.
-                       new String("<html><head><title> title </title>" + 
"</head><body>"
-                                       + "<a href=\";x\">anchor1</a>"
-                                       + "<a href=\"g;x\">anchor2</a>"
-                                       + "<a href=\"g;x?y#s\">anchor3</a>" + 
"</body></html>"),
-                       // 11.
-                       new String("<html><head><title> title </title>" + 
"</head><body>"
-                                       + "<a href=\"g\">anchor1</a>"
-                                       + "<a href=\"g?y#s\">anchor2</a>"
-                                       + "<a href=\"?y=1\">anchor3</a>"
-                                       + "<a href=\"?y=1#s\">anchor4</a>"
-                                       + "<a 
href=\"?y=1;somethingelse\">anchor5</a>"
-                                       + "</body></html>"), };
-
-       private static int SKIP = 9;
-
-       private static String[] testBaseHrefs = { "http://www.nutch.org",
-                       "http://www.nutch.org/docs/foo.html", 
"http://www.nutch.org/docs/",
-                       "http://www.nutch.org/docs/", 
"http://www.nutch.org/frames/",
-                       "http://www.nutch.org/maps/", 
"http://www.nutch.org/whitespace/",
-                       "http://www.nutch.org//", "http://www.nutch.org/",
-                       "http://www.nutch.org/", "http://www.nutch.org/",
-                       "http://www.nutch.org/;something" };
-
-       private static final DocumentFragment testDOMs[] = new 
DocumentFragment[testPages.length];
-
-       private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
-       private static final String[] answerText = {
-                       "body anchor",
-                       "body home bots",
-                       "separate this from this",
-                       "body home 1 2",
-                       "",
-                       "the bottom",
-                       "Whitespace test whitespace test "
-                                       + "This is a whitespace test . Newlines 
should appear as space too. "
-                                       + "Tabs are spaces too. This is a break 
-> and the line after break . "
-                                       + "one two three space here space there 
no space "
-                                       + "one two two three three four put 
some text here and there. "
-                                       + "End this madness ! . . . .", "ignore 
ignore",
-                       "test1 test2", "test1 test2", "anchor1 anchor2 anchor3",
-                       "anchor1 anchor2 anchor3 anchor4 anchor5" };
-
-       private static final String[] answerTitle = { "title", "title", "",
-                       "my title", "my title", "my title", "my title", "", "", 
"",
-                       "title", "title" };
-
-       // note: should be in page-order
-       private static Outlink[][] answerOutlinks;
-
-       private static Configuration conf;
-       private static DOMContentUtils utils = null;
-       
-       public static final Logger Logger = 
LoggerFactory.getLogger(DOMContentUtilsTest.class);
-
-       public DOMContentUtilsTest(String name) {
-       }
-
-       private static void setup() throws Exception {
-               conf = NutchConfiguration.create();
-               conf.setBoolean("parser.html.form.use_action", true);
-               utils = new DOMContentUtils(conf);
-               TikaParser tikaParser = new TikaParser();
-               tikaParser.setConf(conf);
-               Parser parser = 
tikaParser.getTikaConfig().getParser("text/html");
-               for (int i = 0; i < testPages.length; i++) {
-                       Metadata tikamd = new Metadata();
-
-                       HTMLDocumentImpl doc = new HTMLDocumentImpl();
-                       doc.setErrorChecking(false);
-                       DocumentFragment root = doc.createDocumentFragment();
-                       DOMBuilder domhandler = new DOMBuilder(doc, root);
-                       ParseContext context = new ParseContext();
-                       // to add once available in Tika
-                       //context.set(HtmlMapper.class, 
IdentityHtmlMapper.INSTANCE);
-                       try {
-                               parser.parse(new 
ByteArrayInputStream(testPages[i].getBytes()),
-                                               domhandler, tikamd, context);
-                               testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
-                       } catch (Exception e) {
-                               e.printStackTrace();
-                               fail("caught exception: " + e);
-                       }
-                       testDOMs[i] = root;
-                       LSSerializerImpl lsi = new LSSerializerImpl();
-                       System.out.println("input " + i + ": '" + testPages[i] 
+ "'");
-                       System.out.println("output " + i + ": '" + 
lsi.writeToString(root)
-                                       + "'");
-
-               }
-               answerOutlinks = new Outlink[][] {
-                               // 0
-                               { new Outlink("http://www.nutch.org", 
"anchor"), },
-                               // 1
-                               {
-                                 new Outlink("http://www.nutch.org/", "home"),
-                                 new 
Outlink("http://www.nutch.org/docs/bot.html",
-                                                               "bots"), },
-                               // 2
-                               {
-                                       new Outlink("http://www.nutch.org/", 
"separate this"),
-                                       new 
Outlink("http://www.nutch.org/docs/ok", "from this"), },
-                               
-                               // 3    
-                               {   new Outlink("http://www.nutch.org/", 
"home"),
-                                       new 
Outlink("http://www.nutch.org/docs/1", "1"),
-                                       new 
Outlink("http://www.nutch.org/docs/2", "2"), },
-                               // 4    
-                               {
-                                       new 
Outlink("http://www.nutch.org/frames/top.html", ""),
-                                       new 
Outlink("http://www.nutch.org/frames/left.html", ""),
-                                       new 
Outlink("http://www.nutch.org/frames/invalid.html",""),
-                                       new 
Outlink("http://www.nutch.org/frames/right.html",""), 
-                               },
-                               // 5
-                               { 
-                                       new 
Outlink("http://www.nutch.org/maps/logo.gif", ""),
-                                       new 
Outlink("http://www.nutch.org/index.html", ""),
-                                       new 
Outlink("http://www.nutch.org/maps/#bottom", ""),
-                                       new 
Outlink("http://www.nutch.org/bot.html", ""),
-                                       new 
Outlink("http://www.nutch.org/docs/index.html", "") 
-                               },
-                               // 6
-                               { new Outlink("http://www.nutch.org/index.html",
-                                               "whitespace test"), 
-                               },
-                               // 7
-                               {},
-                               // 8
-                               { new Outlink("http://www.nutch.org/dummy.jsp", 
"test2"), },
-                               // 9
-                               {},
-                               // 10 
-                               { 
-                                new Outlink("http://www.nutch.org/;x", 
"anchor1"),
-                                new Outlink("http://www.nutch.org/g;x", 
"anchor2"),
-                                new Outlink("http://www.nutch.org/g;x?y#s", 
"anchor3") 
-                               },
-                               // 11
-                               {
-                                // this is tricky - see RFC3986 section 5.4.1 
example 7
-                                new 
Outlink("http://www.nutch.org/g","anchor1"),
-                                new Outlink("http://www.nutch.org/g?y#s", 
"anchor2"),
-                                new 
Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-                                new 
Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-                                new 
Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") }
-                               };
-
-       }
-
-       private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-               StringTokenizer st1 = new StringTokenizer(s1);
-               StringTokenizer st2 = new StringTokenizer(s2);
-
-               while (st1.hasMoreTokens()) {
-                       if (!st2.hasMoreTokens()) {
-                        Logger.info("st1+ '" + st1.nextToken() + "'");
-                               return false;
-                       }
-                       String st1Token = st1.nextToken();
-                       String st2Token = st2.nextToken();
-                       if (!st1Token.equals(st2Token)) {
-                        Logger.info("st1:'" + st1Token + "' != st2:'" + 
st2Token + "'");
-                               return false;
-                       }
-               }
-               if (st2.hasMoreTokens()) {
-                       System.err.println("st2+ '" + st2.nextToken() + "'");
-                       return false;
-               }
-               return true;
-       }
-
-       @Test
-       public void testGetText() throws Exception {
-               if (testDOMs[0] == null)
-                       setup();
-               for (int i = 0; i < testPages.length; i++) {
-                       StringBuffer sb = new StringBuffer();
-                       utils.getText(sb, testDOMs[i]);
-                       String text = sb.toString();
-                       assertTrue(
-                                       "example " + i + " : expecting text: " 
+ answerText[i]
-                                                       + 
System.getProperty("line.separator")
-                                                       + 
System.getProperty("line.separator")
-                                                       + "got text: " + text,
-                                       equalsIgnoreWhitespace(answerText[i], 
text));
-               }
-       }
-
-       // won't work with Tika - the title is stored in the metadata but
-       // not put in the XHTML representation
-       @Test
-       public void testGetTitle() throws Exception {
-               if (testDOMs[0] == null)
-                       setup();
-               for (int i = 0; i < testPages.length; i++) {
-                       StringBuffer sb = new StringBuffer();
-                       utils.getTitle(sb, testDOMs[i]);
-                       String title = sb.toString();
-                       assertTrue(
-                                       "example " + i + " : expecting title: " 
+ answerTitle[i]
-                                                       + 
System.getProperty("line.separator")
-                                                       + 
System.getProperty("line.separator")
-                                                       + "got title: " + title,
-                                       equalsIgnoreWhitespace(answerTitle[i], 
title));
-               }
-       }
-
-       @Test
-       public void testGetOutlinks() throws Exception {
-               if (testDOMs[0] == null)
-                       setup();
-               for (int i = 0; i < testPages.length; i++) {
-                       ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
-                       if (i == SKIP) {
-                               conf.setBoolean("parser.html.form.use_action", 
false);
-                               utils.setConf(conf);
-                       } else {
-                               conf.setBoolean("parser.html.form.use_action", 
true);
-                               utils.setConf(conf);
-                       }
-                       utils.getOutlinks(testBaseHrefURLs[i], outlinks, 
testDOMs[i]);
-                       Outlink[] outlinkArr = new Outlink[outlinks.size()];
-                       outlinkArr = outlinks.toArray(outlinkArr);
-                       compareOutlinks(i, answerOutlinks[i], outlinkArr);
-               }
-       }
-
-       private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-               for (int i = 0; i < o.length; i++) {
-                       sb.append(o[i].toString());
-                       sb.append(System.getProperty("line.separator"));
-               }
-       }
-
-       private static final String outlinksString(Outlink[] o) {
-               StringBuffer sb = new StringBuffer();
-               appendOutlinks(sb, o);
-               return sb.toString();
-       }
-
-       private static final void compareOutlinks(int test, Outlink[] o1,
-                       Outlink[] o2) {
-               if (o1.length != o2.length) {
-                       assertTrue(
-                                       "test " + test
-                                                       + ", got wrong number 
of outlinks (expecting "
-                                                       + o1.length + ", got " 
+ o2.length + ")"
-                                                       + 
System.getProperty("line.separator") + "answer: "
-                                                       + 
System.getProperty("line.separator")
-                                                       + outlinksString(o1)
-                                                       + 
System.getProperty("line.separator") + "got: "
-                                                       + 
System.getProperty("line.separator")
-                                                       + outlinksString(o2)
-                                                       + 
System.getProperty("line.separator"), false);
-               }
-
-               for (int i = 0; i < o1.length; i++) {
-                       if (!o1[i].equals(o2[i])) {
-                               assertTrue(
-                                               "test " + test + ", got wrong 
outlinks at position "
-                                                               + i + 
System.getProperty("line.separator")
-                                                               + "answer: "
-                                                               + 
System.getProperty("line.separator")
-                                                               + 
o1[i].toString()
-                                                               + 
System.getProperty("line.separator")
-                                                               + "got: "
-                                                               + 
System.getProperty("line.separator")
-                                                               + 
o2[i].toString(), false);
-
-                       }
-               }
-       }
+  private static final String[] testPages = {
+      // 0.
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      // 1.
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      // 2.
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // 3.
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // 4.
+      // test frameset link extraction. The invalid frame in the middle
+      // will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "</frame>"
+          + "<frame src=\"invalid.html\"/>" + "</frame>"
+          + "<frame src=\"right.html\">" + "</frame>" + "</frameset>"
+          + "</frameset>" + "</body></html>"),
+      // 5.
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" 
href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" 
href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" 
href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // 6.
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  
\t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> 
break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no 
space</td></tr>"
+              + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+      // 7.
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // 8.
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // 9.
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // 10.
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      // 11.
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), 
};
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something" };
+
+  private static final DocumentFragment testDOMs[] = new 
DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "body anchor",
+      "body home bots",
+      "separate this from this",
+      "body home 1 2",
+      "",
+      "the bottom",
+      "Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break 
. "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "anchor1 anchor2 anchor3",
+      "anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  public static final Logger Logger = LoggerFactory
+      .getLogger(DOMContentUtilsTest.class);
+
+  public DOMContentUtilsTest(String name) {
+  }
+
+  private static void setup() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    TikaParser tikaParser = new TikaParser();
+    tikaParser.setConf(conf);
+    Parser parser = tikaParser.getTikaConfig().getParser("text/html");
+    for (int i = 0; i < testPages.length; i++) {
+      Metadata tikamd = new Metadata();
+
+      HTMLDocumentImpl doc = new HTMLDocumentImpl();
+      doc.setErrorChecking(false);
+      DocumentFragment root = doc.createDocumentFragment();
+      DOMBuilder domhandler = new DOMBuilder(doc, root);
+      ParseContext context = new ParseContext();
+      // to add once available in Tika
+      // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+      try {
+        parser.parse(new ByteArrayInputStream(testPages[i].getBytes()),
+            domhandler, tikamd, context);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        e.printStackTrace();
+        fail("caught exception: " + e);
+      }
+      testDOMs[i] = root;
+      LSSerializerImpl lsi = new LSSerializerImpl();
+      System.out.println("input " + i + ": '" + testPages[i] + "'");
+      System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'");
+
+    }
+    answerOutlinks = new Outlink[][] {
+        // 0
+        { new Outlink("http://www.nutch.org", "anchor"), },
+        // 1
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+        // 2
+        { new Outlink("http://www.nutch.org/", "separate this"),
+            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+
+        // 3
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/1", "1"),
+            new Outlink("http://www.nutch.org/docs/2", "2"), },
+        // 4
+        { new Outlink("http://www.nutch.org/frames/top.html", ""),
+            new Outlink("http://www.nutch.org/frames/left.html", ""),
+            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+            new Outlink("http://www.nutch.org/frames/right.html", ""), },
+        // 5
+        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+            new Outlink("http://www.nutch.org/index.html", ""),
+            new Outlink("http://www.nutch.org/maps/#bottom", ""),
+            new Outlink("http://www.nutch.org/bot.html", ""),
+            new Outlink("http://www.nutch.org/docs/index.html", "") },
+        // 6
+        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+        // 7
+        {},
+        // 8
+        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+        // 9
+        {},
+        // 10
+        { new Outlink("http://www.nutch.org/;x", "anchor1"),
+            new Outlink("http://www.nutch.org/g;x", "anchor2"),
+            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+        // 11
+        {
+            // this is tricky - see RFC3986 section 5.4.1 example 7
+            new Outlink("http://www.nutch.org/g", "anchor1"),
+            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                "anchor5") } };
+
+  }
+
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer st1 = new StringTokenizer(s1);
+    StringTokenizer st2 = new StringTokenizer(s2);
+
+    while (st1.hasMoreTokens()) {
+      if (!st2.hasMoreTokens()) {
+        Logger.info("st1+ '" + st1.nextToken() + "'");
+        return false;
+      }
+      String st1Token = st1.nextToken();
+      String st2Token = st2.nextToken();
+      if (!st1Token.equals(st2Token)) {
+        Logger.info("st1:'" + st1Token + "' != st2:'" + st2Token + "'");
+        return false;
+      }
+    }
+    if (st2.hasMoreTokens()) {
+      System.err.println("st2+ '" + st2.nextToken() + "'");
+      return false;
+    }
+    return true;
+  }
+
+  @Test
+  public void testGetText() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getText(sb, testDOMs[i]);
+      String text = sb.toString();
+      assertTrue(
+          "example " + i + " : expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerText[i], text));
+    }
+  }
+
+  // won't work with Tika - the title is stored in the metadata but
+  // not put in the XHTML representation
+  @Test
+  public void testGetTitle() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String title = sb.toString();
+      assertTrue(
+          "example " + i + " : expecting title: " + answerTitle[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got title: " + title,
+          equalsIgnoreWhitespace(answerTitle[i], title));
+    }
+  }
+
+  @Test
+  public void testGetOutlinks() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+      if (i == SKIP) {
+        conf.setBoolean("parser.html.form.use_action", false);
+        utils.setConf(conf);
+      } else {
+        conf.setBoolean("parser.html.form.use_action", true);
+        utils.setConf(conf);
+      }
+      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+      Outlink[] outlinkArr = new Outlink[outlinks.size()];
+      outlinkArr = outlinks.toArray(outlinkArr);
+      compareOutlinks(i, answerOutlinks[i], outlinkArr);
+    }
+  }
+
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    for (int i = 0; i < o.length; i++) {
+      sb.append(o[i].toString());
+      sb.append(System.getProperty("line.separator"));
+    }
+  }
+
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer sb = new StringBuffer();
+    appendOutlinks(sb, o);
+    return sb.toString();
+  }
+
+  private static final void compareOutlinks(int test, Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      assertTrue(
+          "test " + test + ", got wrong number of outlinks (expecting "
+              + o1.length + ", got " + o2.length + ")"
+              + System.getProperty("line.separator") + "answer: "
+              + System.getProperty("line.separator") + outlinksString(o1)
+              + System.getProperty("line.separator") + "got: "
+              + System.getProperty("line.separator") + outlinksString(o2)
+              + System.getProperty("line.separator"), false);
+    }
+
+    for (int i = 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        assertTrue(
+            "test " + test + ", got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + o1[i].toString()
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + o2[i].toString(),
+            false);
+
+      }
+    }
+  }
 }


Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
 Fri Jan  9 06:34:33 2015
@@ -39,9 +39,9 @@ public class TestImageMetadata {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  private String[] sampleFiles = {"nutch_logo_tm.gif"};
+  private String[] sampleFiles = { "nutch_logo_tm.gif" };
 
   @Test
   public void testIt() throws ProtocolException, ParseException, IOException {
@@ -50,32 +50,32 @@ public class TestImageMetadata {
     Parse parse;
     Configuration conf = NutchConfiguration.create();
     MimeUtil mimeutil = new MimeUtil(conf);
-    
+
     for (int i = 0; i < sampleFiles.length; i++) {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-      
+
       File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
       byte[] bytes = new byte[(int) file.length()];
       DataInputStream in = new DataInputStream(new FileInputStream(file));
       in.readFully(bytes);
       in.close();
-      
+
       WebPage page = WebPage.newBuilder().build();
       page.setBaseUrl(new Utf8(urlString));
       page.setContent(ByteBuffer.wrap(bytes));
       String mtype = mimeutil.getMimeType(file);
       page.setContentType(new Utf8(mtype));
-      
+
       parse = new ParseUtil(conf).parse(urlString, page);
-      
-      //assert width
+
+      // assert width
       ByteBuffer bbufW = page.getMetadata().get(new Utf8("width"));
       byte[] byteArrayW = new byte[bbufW.remaining()];
       bbufW.get(byteArrayW);
       String width = new String(byteArrayW);
       assertEquals("121", width);
-      
-      //assert height
+
+      // assert height
       ByteBuffer bbufH = page.getMetadata().get(new Utf8("height"));
       byte[] byteArrayH = new byte[bbufH.remaining()];
       bbufH.get(byteArrayH);

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
 Fri Jan  9 06:34:33 2015
@@ -44,63 +44,63 @@ import static org.junit.Assert.assertTru
  */
 public class TestMSWordParser {
 
-    private String fileSeparator = System.getProperty("file.separator");
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    private String sampleDir = System.getProperty("test.data", ".");
-    // Make sure sample files are copied to "test.data" as specified in
-    // ./src/plugin/parse-msword/build.xml during plugin compilation.
-    // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
-    private String[] sampleFiles = { "word97.doc" };
-
-    private String expectedText = "This is a sample doc file prepared for nutch.";
-
-    private Configuration conf;
-
-    @Before
-    public void setUp() {
-       conf = NutchConfiguration.create();
-       conf.set("file.content.limit", "-1");
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException, IOException {
+    String urlString = sampleDir + fileSeparator + fileName;
+
+    File file = new File(urlString);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    in.readFully(bytes);
+    in.close();
+    Parse parse;
+    WebPage page = WebPage.newBuilder().build();
+    page.setBaseUrl(new Utf8("file:" + urlString));
+    page.setContent(ByteBuffer.wrap(bytes));
+    // set the content type?
+    MimeUtil mimeutil = new MimeUtil(conf);
+    String mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype));
+
+    parse = new ParseUtil(conf).parse("file:" + urlString, page);
+    return parse.getText();
+  }
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      assertTrue("text found : '" + found + "'", found.startsWith(expectedText));
     }
+  }
 
-    public String getTextContent(String fileName) throws ProtocolException,
-           ParseException, IOException {
-       String urlString = sampleDir + fileSeparator + fileName;
-
-       File file = new File(urlString);
-       byte[] bytes = new byte[(int) file.length()];
-       DataInputStream in = new DataInputStream(new FileInputStream(file));
-       in.readFully(bytes);
-       in.close();
-       Parse parse;
-       WebPage page = WebPage.newBuilder().build();
-       page.setBaseUrl(new Utf8("file:"+urlString));
-       page.setContent(ByteBuffer.wrap(bytes));
-       // set the content type?
-       MimeUtil mimeutil = new MimeUtil(conf);
-       String mtype = mimeutil.getMimeType(file);
-       page.setContentType(new Utf8(mtype));
-               
-       parse = new ParseUtil(conf).parse("file:"+urlString, page);
-       return parse.getText();
-    }
-
-    @Test
-    public void testIt() throws ProtocolException, ParseException, IOException {
-       for (int i = 0; i < sampleFiles.length; i++) {
-           String found = getTextContent(sampleFiles[i]);
-           assertTrue("text found : '" + found + "'", found
-                   .startsWith(expectedText));
-       }
-    }
-
-    @Test
-    public void testOpeningDocs() throws ProtocolException, ParseException, IOException {
-       String[] filenames = new File(sampleDir).list();
-       for (int i = 0; i < filenames.length; i++) {
-           if (filenames[i].endsWith(".doc") == false)
-               continue;
-           assertTrue("cann't read content of " + filenames[i],
-                   getTextContent(filenames[i]).length() > 0);
-       }
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException,
+      IOException {
+    String[] filenames = new File(sampleDir).list();
+    for (int i = 0; i < filenames.length; i++) {
+      if (filenames[i].endsWith(".doc") == false)
+        continue;
+      assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
     }
+  }
 }

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
 Fri Jan  9 06:34:33 2015
@@ -57,11 +57,11 @@ public class TestOOParser {
     Parse parse;
     Configuration conf = NutchConfiguration.create();
     MimeUtil mimeutil = new MimeUtil(conf);
-       
+
     try {
       // read the test string
       FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-        + sampleText);
+          + sampleText);
       StringBuffer sb = new StringBuffer();
       int len = 0;
       InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
@@ -83,7 +83,7 @@ public class TestOOParser {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       if (sampleFiles[i].startsWith("ootest") == false)
-      continue;
+        continue;
 
       File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
       byte[] bytes = new byte[(int) file.length()];

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
 Fri Jan  9 06:34:33 2015
@@ -43,43 +43,43 @@ import static org.junit.Assert.assertTru
  */
 public class TestPdfParser {
 
-    private String fileSeparator = System.getProperty("file.separator");
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    private String sampleDir = System.getProperty("test.data", ".");
-    // Make sure sample files are copied to "test.data" as specified in
-    // ./src/plugin/parse-pdf/build.xml during plugin compilation.
-    // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
-    private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
-
-    private String expectedText = "A VERY SMALL PDF FILE";
-
-    @Test
-    public void testIt() throws ProtocolException, ParseException, IOException {
-       String urlString;
-       Parse parse;
-       Configuration conf = NutchConfiguration.create();
-       MimeUtil mimeutil = new MimeUtil(conf);
-
-       for (int i = 0; i < sampleFiles.length; i++) {
-           urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-           File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
-           byte[] bytes = new byte[(int) file.length()];
-           DataInputStream in = new DataInputStream(new FileInputStream(file));
-           in.readFully(bytes);
-           in.close();
-
-           WebPage page = WebPage.newBuilder().build();
-           page.setBaseUrl(new Utf8(urlString));
-           page.setContent(ByteBuffer.wrap(bytes));
-           String mtype = mimeutil.getMimeType(file);
-           page.setContentType(new Utf8(mtype));
-
-           parse = new ParseUtil(conf).parse(urlString, page);
-
-           int index = parse.getText().indexOf(expectedText);
-           assertTrue(index > 0);
-       }
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    String urlString;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+      byte[] bytes = new byte[(int) file.length()];
+      DataInputStream in = new DataInputStream(new FileInputStream(file));
+      in.readFully(bytes);
+      in.close();
+
+      WebPage page = WebPage.newBuilder().build();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(bytes));
+      String mtype = mimeutil.getMimeType(file);
+      page.setContentType(new Utf8(mtype));
+
+      parse = new ParseUtil(conf).parse(urlString, page);
+
+      int index = parse.getText().indexOf(expectedText);
+      assertTrue(index > 0);
     }
+  }
 
 }

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
 Fri Jan  9 06:34:33 2015
@@ -68,7 +68,7 @@ public class TestRSSParser {
    * </ul>
    */
   @Test
-  public void testIt()throws ProtocolException, ParseException, IOException {
+  public void testIt() throws ProtocolException, ParseException, IOException {
     String urlString;
     Parse parse;
 

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
 Fri Jan  9 06:34:33 2015
@@ -44,50 +44,49 @@ import static org.junit.Assert.assertEqu
  */
 public class TestRTFParser {
 
-    private String fileSeparator = System.getProperty("file.separator");
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    private String sampleDir = System.getProperty("test.data", ".");
-    // Make sure sample files are copied to "test.data" as specified in
-    // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-    // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
-    private String rtfFile = "test.rtf";
-
-    @Test
-    public void testIt() throws ProtocolException, ParseException, IOException {
-
-       String urlString;
-       Parse parse;
-       Configuration conf = NutchConfiguration.create();
-       MimeUtil mimeutil = new MimeUtil(conf);
-
-       urlString = "file:" + sampleDir + fileSeparator + rtfFile;
-
-       File file = new File(sampleDir + fileSeparator + rtfFile);
-       byte[] bytes = new byte[(int) file.length()];
-       DataInputStream in = new DataInputStream(new FileInputStream(file));
-       in.readFully(bytes);
-       in.close();
-
-       WebPage page = WebPage.newBuilder().build();
-       page.setBaseUrl(new Utf8(urlString));
-       page.setContent(ByteBuffer.wrap(bytes));
-       String mtype = mimeutil.getMimeType(file);
-       page.setContentType(new Utf8(mtype));
-
-       parse = new ParseUtil(conf).parse(urlString, page);
-
-       String title = parse.getTitle();
-       String text = parse.getText();
-       assertEquals("test rft document", title);
-       //assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
-
-       
-       // HOW DO WE GET THE PARSE METADATA?
-       // Metadata meta = parse();
-
-       // METADATA extraction is not yet supported in Tika
-       // 
-       // assertEquals("tests", meta.get(DublinCore.SUBJECT));
-    }
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  private String rtfFile = "test.rtf";
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+
+    String urlString;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+
+    File file = new File(sampleDir + fileSeparator + rtfFile);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    in.readFully(bytes);
+    in.close();
+
+    WebPage page = WebPage.newBuilder().build();
+    page.setBaseUrl(new Utf8(urlString));
+    page.setContent(ByteBuffer.wrap(bytes));
+    String mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype));
+
+    parse = new ParseUtil(conf).parse(urlString, page);
+
+    String title = parse.getTitle();
+    String text = parse.getText();
+    assertEquals("test rft document", title);
+    // assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+
+    // HOW DO WE GET THE PARSE METADATA?
+    // Metadata meta = parse();
+
+    // METADATA extraction is not yet supported in Tika
+    //
+    // assertEquals("tests", meta.get(DublinCore.SUBJECT));
+  }
 
 }

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Fri Jan  9 06:34:33 2015
@@ -32,10 +32,11 @@ import java.util.Collection;
 import java.util.HashSet;
 
 /**
- * This class is a protocol plugin used for file: scheme.
- * It creates {@link FileResponse} object and gets the content of the url from it.
- * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} 
- * in nutch-default.xml defined under "file properties" section.
+ * This class is a protocol plugin used for file: scheme. It creates
+ * {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and
+ * {@code file.crawl.parent} in nutch-default.xml defined under
+ * "file properties" section.
  */
 public class File implements Protocol {
 
@@ -51,7 +52,7 @@ public class File implements Protocol {
   static final int MAX_REDIRECTS = 5;
 
   int maxContentLength;
-  
+
   boolean crawlParents;
 
   /**
@@ -63,7 +64,8 @@ public class File implements Protocol {
   private Configuration conf;
 
   // constructor
-  public File() { }
+  public File() {
+  }
 
   /**
    * Set the {@link Configuration} object
@@ -75,29 +77,32 @@ public class File implements Protocol {
     this.symlinksAsRedirects = conf.getBoolean(
         "file.crawl.redirect_noncanonical", true);
   }
-  
+
   /**
    * Get the {@link Configuration} object
    */
   public Configuration getConf() {
     return this.conf;
   }
-    
-  /** 
-   * Set the point at which content is truncated. 
+
+  /**
+   * Set the point at which content is truncated.
    */
   public void setMaxContentLength(int maxContentLength) {
     this.maxContentLength = maxContentLength;
   }
-  
-  /** 
-   * Creates a {@link FileResponse} object corresponding to the url and 
-   * return a {@link ProtocolOutput} object as per the content received
+
+  /**
+   * Creates a {@link FileResponse} object corresponding to the url and return a
+   * {@link ProtocolOutput} object as per the content received
    * 
-   * @param url Text containing the url
-   * @param datum The CrawlDatum object corresponding to the url
+   * @param url
+   *          Text containing the url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
    * 
-   * @return {@link ProtocolOutput} object for the content of the file indicated by url
+   * @return {@link ProtocolOutput} object for the content of the file indicated
+   *         by url
    */
   public ProtocolOutput getProtocolOutput(String url, WebPage page) {
     String urlString = url.toString();
@@ -115,13 +120,16 @@ public class File implements Protocol {
           return new ProtocolOutput(response.toContent()); // return it
 
         } else if (code == 304) { // got not modified
-          return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTMODIFIED);
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatusUtils.STATUS_NOTMODIFIED);
 
         } else if (code == 401) { // access denied / no read permissions
-          return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED));
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.ACCESS_DENIED));
 
         } else if (code == 404) { // no such file
-          return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTFOUND);
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatusUtils.STATUS_NOTFOUND);
 
         } else if (code >= 300 && code < 400) { // handle redirect
           u = new URL(response.getHeader("Location"));
@@ -156,8 +164,8 @@ public class File implements Protocol {
     return FIELDS;
   }
 
-  /** 
-   * Quick way for running this class. Useful for debugging. 
+  /**
+   * Quick way for running this class. Useful for debugging.
    */
   public static void main(String[] args) throws Exception {
     int maxContentLength = Integer.MIN_VALUE;
@@ -216,11 +224,11 @@ public class File implements Protocol {
     file = null;
   }
 
-  /** 
-   * No robots parsing is done for file protocol. 
-   * So this returns a set of empty rules which will allow every url.
+  /**
+   * No robots parsing is done for file protocol. So this returns a set of empty
+   * rules which will allow every url.
    */
   public BaseRobotRules getRobotRules(String url, WebPage page) {
     return RobotRulesParser.EMPTY_RULES;
-  }   
+  }
 }

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
 Fri Jan  9 06:34:33 2015
@@ -17,13 +17,16 @@
 
 package org.apache.nutch.protocol.file;
 
-/** Thrown for File error codes.
+/**
+ * Thrown for File error codes.
  */
 public class FileError extends FileException {
 
   private int code;
-  
-  public int getCode(int code) { return code; }
+
+  public int getCode(int code) {
+    return code;
+  }
 
   public FileError(int code) {
     super("File Error: " + code);

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Fri Jan  9 06:34:33 2015
@@ -30,31 +30,27 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.MimeUtil;
 
-
 /************************************
- * FileResponse.java mimics file replies as http response.
- * It tries its best to follow http's way for headers, response codes
- * as well as exceptions.
- *
- * Comments:
- * (1) java.net.URL and java.net.URLConnection can handle file: scheme.
- * However they are not flexible enough, so not used in this implementation.
- *
- * (2) java.io.File is used for its abstractness across platforms.
- * Warning:
- * java.io.File API (1.4.2) does not elaborate on how special files,
- * such as /dev/* in unix and /proc/* on linux, are treated. Tests show
- *  (a) java.io.File.isFile() return false for /dev/*
- *  (b) java.io.File.isFile() return true for /proc/*
- *  (c) java.io.File.length() return 0 for /proc/*
- * We are probably oaky for now. Could be buggy here.
- * How about special files on windows?
- *
- * (3) java.io.File API (1.4.2) does not seem to know unix hard link files.
- * They are just treated as individual files.
- *
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ * 
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably oaky for now. Could be buggy here. How about special files on
+ * windows?
+ * 
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ * 
  * (4) No funcy POSIX file attributes yet. May never need?
- *
+ * 
  * @author John Xing
  ***********************************/
 public class FileResponse {
@@ -68,33 +64,36 @@ public class FileResponse {
 
   private final File file;
   private Configuration conf;
-  
+
   private MimeUtil MIME;
 
   /** Returns the response code. */
-  public int getCode() { return code; }
+  public int getCode() {
+    return code;
+  }
 
   /** Returns the value of a named header. */
   public String getHeader(String name) {
     return headers.get(name);
   }
 
-  public byte[] getContent() { return content; }
+  public byte[] getContent() {
+    return content;
+  }
 
   public Content toContent() {
     return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
-                       getHeader(Response.CONTENT_TYPE),
-                       headers, this.conf);
+        getHeader(Response.CONTENT_TYPE), headers, this.conf);
   }
-  
+
   public FileResponse(URL url, WebPage page, File file, Configuration conf)
-    throws FileException, IOException {
+      throws FileException, IOException {
 
     this.orig = url.toString();
     this.base = url.toString();
     this.file = file;
     this.conf = conf;
-    
+
     MIME = new MimeUtil(conf);
 
     if (!"file".equals(url.getProtocol()))
@@ -117,22 +116,22 @@ public class FileResponse {
       path = java.net.URLDecoder.decode(path, "UTF-8");
     } catch (UnsupportedEncodingException ex) {
     }
-    
+
     try {
 
       this.content = null;
 
       // url.toURI() is only in j2se 1.5.0
-      //java.io.File f = new java.io.File(url.toURI());
+      // java.io.File f = new java.io.File(url.toURI());
       java.io.File f = new java.io.File(path);
 
       if (!f.exists()) {
-        this.code = 404;  // http Not Found
+        this.code = 404; // http Not Found
         return;
       }
 
       if (!f.canRead()) {
-        this.code = 401;  // http Unauthorized
+        this.code = 401; // http Unauthorized
         return;
       }
 
@@ -141,15 +140,17 @@ public class FileResponse {
       // where case is insensitive
       if (!f.equals(f.getCanonicalFile())) {
         // set headers
-        //hdrs.put("Location", f.getCanonicalFile().toURI());
-        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());
+        // hdrs.put("Location", f.getCanonicalFile().toURI());
+        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
+            .toString());
 
-        this.code = 300;  // http redirect
+        this.code = 300; // http redirect
         return;
       }
       if (f.lastModified() <= page.getModifiedTime()) {
         this.code = 304;
-        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
+        this.headers.set("Last-Modified",
+            HttpDateFormat.toString(f.lastModified()));
         return;
       }
 
@@ -169,45 +170,46 @@ public class FileResponse {
   }
 
   // get file as http response
-  private void getFileAsHttpResponse(java.io.File f)
-    throws FileException, IOException {
+  private void getFileAsHttpResponse(java.io.File f) throws FileException,
+      IOException {
 
     // ignore file of size larger than
     // Integer.MAX_VALUE = 2^31-1 = 2147483647
     long size = f.length();
     if (size > Integer.MAX_VALUE) {
-      throw new FileException("file is too large, size: "+size);
+      throw new FileException("file is too large, size: " + size);
       // or we can do this?
-      // this.code = 400;  // http Bad request
+      // this.code = 400; // http Bad request
       // return;
     }
 
     // capture content
     int len = (int) size;
-    
+
     if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
       len = this.file.maxContentLength;
 
     this.content = new byte[len];
 
     java.io.InputStream is = new java.io.FileInputStream(f);
-    int offset = 0; int n = 0;
+    int offset = 0;
+    int n = 0;
     while (offset < len
-      && (n = is.read(this.content, offset, len-offset)) >= 0) {
+        && (n = is.read(this.content, offset, len - offset)) >= 0) {
       offset += n;
     }
     if (offset < len) { // keep whatever already have, but issue a warning
       if (File.LOG.isWarnEnabled()) {
-        File.LOG.warn("not enough bytes read from file: "+f.getPath());
+        File.LOG.warn("not enough bytes read from file: " + f.getPath());
       }
     }
-    is.close(); 
+    is.close();
 
     // set headers
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
-    headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
-        .lastModified()));
-    
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
     String mimeType = MIME.getMimeType(f);
     String mimeTypeString = mimeType != null ? mimeType.toString() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);
@@ -217,33 +219,33 @@ public class FileResponse {
   }
 
   // get dir list as http response
-  private void getDirAsHttpResponse(java.io.File f)
-    throws IOException {
+  private void getDirAsHttpResponse(java.io.File f) throws IOException {
 
     String path = f.toString();
     if (this.file.crawlParents)
-        this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+      this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
+          : true);
     else
-        this.content = list2html(f.listFiles(), path, false);
+      this.content = list2html(f.listFiles(), path, false);
 
     // set headers
     headers.set(Response.CONTENT_LENGTH,
-      new Integer(this.content.length).toString());
+        new Integer(this.content.length).toString());
     headers.set(Response.CONTENT_TYPE, "text/html");
     headers.set(Response.LAST_MODIFIED,
-      HttpDateFormat.toString(f.lastModified()));
+        HttpDateFormat.toString(f.lastModified()));
 
     // response code
     this.code = 200; // http OK
   }
 
   // generate html page from dir list
-  private byte[] list2html(java.io.File[] list,
-    String path, boolean includeDotDot) {
+  private byte[] list2html(java.io.File[] list, String path,
+      boolean includeDotDot) {
 
     StringBuffer x = new StringBuffer("<html><head>");
-    x.append("<title>Index of "+path+"</title></head>\n");
-    x.append("<body><h1>Index of "+path+"</h1><pre>\n");
+    x.append("<title>Index of " + path + "</title></head>\n");
+    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
 
     if (includeDotDot) {
       x.append("<a href='../'>../</a>\t-\t-\t-\n");
@@ -252,20 +254,20 @@ public class FileResponse {
     // fix me: we might want to sort list here! but not now.
 
     java.io.File f;
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       f = list[i];
       String name = f.getName();
       String time = HttpDateFormat.toString(f.lastModified());
       if (f.isDirectory()) {
         // java 1.4.2 api says dir itself and parent dir are not listed
         // so the following is not needed.
-        //if (name.equals(".") || name.equals(".."))
-        //  continue;
-        x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
-        x.append(time+"\t-\n");
+        // if (name.equals(".") || name.equals(".."))
+        // continue;
+        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
+        x.append(time + "\t-\n");
       } else if (f.isFile()) {
-        x.append("<a href='"+name+    "'>"+name+"</a>\t");
-        x.append(time+"\t"+f.length()+"\n");
+        x.append("<a href='" + name + "'>" + name + "</a>\t");
+        x.append(time + "\t" + f.length() + "\n");
       } else {
         // ignore any other
       }

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Fri Jan  9 06:34:33 2015
@@ -37,9 +37,10 @@ import static org.junit.Assert.*;
  * @author mattmann
  * @version $Revision$
  * 
- * <p>
- * Unit tests for the {@link File}Protocol.
- * </p>.
+ *          <p>
+ *          Unit tests for the {@link File}Protocol.
+ *          </p>
+ *          .
  */
 public class TestProtocolFile {
 
@@ -47,12 +48,13 @@ public class TestProtocolFile {
   private String sampleDir = System.getProperty("test.data", ".");
 
   private static final String[] testTextFiles = new String[] {
-      "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" };
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
+      "testprotocolfile_%28encoded%29.txt" };
 
   private static final String expectedMimeType = "text/plain";
-  
+
   private Configuration conf;
-  
+
   @Before
   public void setUp() {
     conf = NutchConfiguration.create();
@@ -64,11 +66,11 @@ public class TestProtocolFile {
       setContentType(testTextFile);
     }
   }
-  
+
   /**
-   * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata
-   * field.
-   * @throws ProtocolNotFound 
+   * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
+   * 
+   * @throws ProtocolNotFound
    * 
    * @since NUTCH-384
    * 
@@ -78,19 +80,19 @@ public class TestProtocolFile {
     assertNotNull(urlString);
     WebPage datum = WebPage.newBuilder().build();
     Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    ProtocolOutput output = protocol.getProtocolOutput(urlString,datum);
+    ProtocolOutput output = protocol.getProtocolOutput(urlString, datum);
     assertNotNull(output);
 
     assertEquals("Status code: [" + output.getStatus().getCode()
         + "], not equal to: [" + ProtocolStatusCodes.SUCCESS + "]: args: ["
-        + output.getStatus().getArgs() + "]", (Integer) ProtocolStatusCodes.SUCCESS, output
-        .getStatus().getCode());
+        + output.getStatus().getArgs() + "]",
+        (Integer) ProtocolStatusCodes.SUCCESS, output.getStatus().getCode());
     assertNotNull(output.getContent());
     assertNotNull(output.getContent().getContentType());
     assertEquals(expectedMimeType, output.getContent().getContentType());
     assertNotNull(output.getContent().getMetadata());
-    assertEquals(expectedMimeType, output.getContent().getMetadata().get(
-        Response.CONTENT_TYPE));
+    assertEquals(expectedMimeType,
+        output.getContent().getMetadata().get(Response.CONTENT_TYPE));
 
   }

svn commit: r1650447 [19/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ s...

Reply via email to