http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html 
b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
new file mode 100644
index 0000000..5dcf7c2
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
@@ -0,0 +1,23 @@
+<html>
+<head>
+<title>nested spider trap</title>
+</head>
+
+<body>Nutch fetcher test page
+<table>
+  <tr> 
+    <td>
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i
 ></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 
+<i><b><i><b><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></
 
b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></b></i></b></i>
 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><
 
b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b
 ><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>
 
<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><
 
i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i
 ><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>
 
<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><
 
b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b><
 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></
 
b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i
 ></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i><
 
/b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></
 
i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b
 ></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 
</b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b><
 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 
+</b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><
 
i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 
+
+    </td>
+  </tr>
+ 
+</table>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/pagea.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/pagea.html 
b/nutch-core/src/test/resources/fetch-test-site/pagea.html
new file mode 100644
index 0000000..6444c41
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/pagea.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/pageb.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/pageb.html 
b/nutch-core/src/test/resources/fetch-test-site/pageb.html
new file mode 100644
index 0000000..66e3725
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/pageb.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>bage b</title>
+ </head>
+<body>
+This is page b
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/robots.txt
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/robots.txt 
b/nutch-core/src/test/resources/fetch-test-site/robots.txt
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-mime-util/test.xlsx
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-mime-util/test.xlsx 
b/nutch-core/src/test/resources/test-mime-util/test.xlsx
new file mode 100644
index 0000000..de33f28
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-mime-util/test.xlsx differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
new file mode 100644
index 0000000..c321777
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
new file mode 100644
index 0000000..5c5d11f
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
new file mode 100644
index 0000000..0f8d263
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
new file mode 100644
index 0000000..4dfeaec
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
new file mode 100644
index 0000000..c4d315a
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
new file mode 100644
index 0000000..6dd171e
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
new file mode 100644
index 0000000..66b1f8d
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
new file mode 100644
index 0000000..ad4ed47
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
new file mode 100644
index 0000000..8d5ffa4
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
new file mode 100644
index 0000000..41ef146
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
new file mode 100644
index 0000000..683a1dd
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
new file mode 100644
index 0000000..3232abf
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
new file mode 100644
index 0000000..47164ee
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
new file mode 100644
index 0000000..a32d62d
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
new file mode 100644
index 0000000..5b71a24
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
new file mode 100644
index 0000000..d931103
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
new file mode 100644
index 0000000..53c925c
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
new file mode 100644
index 0000000..5ba878c
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
new file mode 100644
index 0000000..b58f97f
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
new file mode 100644
index 0000000..9880a27
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
new file mode 100644
index 0000000..1b49819
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
new file mode 100644
index 0000000..5aae648
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
new file mode 100644
index 0000000..8069e84
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
new file mode 100644
index 0000000..9b19ce9
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
new file mode 100644
index 0000000..926ced1
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
new file mode 100644
index 0000000..714a1e8
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
new file mode 100644
index 0000000..f36a9fa
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
new file mode 100644
index 0000000..c648d89
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
new file mode 100644
index 0000000..3ee3c94
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
new file mode 100644
index 0000000..1ef0406
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
new file mode 100644
index 0000000..7948825
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
new file mode 100644
index 0000000..3a83a82
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
new file mode 100644
index 0000000..b46b6f6
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
new file mode 100644
index 0000000..18766e6
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
new file mode 100644
index 0000000..9a1f284
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
new file mode 100644
index 0000000..47fb983
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
new file mode 100644
index 0000000..ceada1b
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
new file mode 100644
index 0000000..b756b5c
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
new file mode 100644
index 0000000..ad96df0
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
new file mode 100644
index 0000000..a3e1d8d
Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/build-plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/build-plugin.xml b/nutch-plugins/build-plugin.xml
new file mode 100755
index 0000000..c759d5f
--- /dev/null
+++ b/nutch-plugins/build-plugin.xml
@@ -0,0 +1,255 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Imported by plugin build.xml files to define default targets. -->
+<project xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <property name="name" value="${ant.project.name}"/>
+  <property name="root" value="${basedir}"/>
+
+  <!-- load plugin-specific properties first -->
+  <property file="${user.home}/${name}.build.properties" />
+  <property file="${root}/build.properties" />
+
+  <property name="nutch.root" location="${root}/../../../"/>
+
+  <property name="src.dir" location="${root}/src/java"/>
+  <property name="src.test" location="${root}/src/test"/>
+
+  <available file="${src.test}" type="dir" property="test.available"/>
+
+  <property name="conf.dir" location="${nutch.root}/conf"/>
+
+  <property name="build.dir" location="${nutch.root}/build/${name}"/>
+  <property name="build.classes" location="${build.dir}/classes"/>
+  <property name="build.test" location="${build.dir}/test"/>
+  <property name="build.test.lib" location="${build.test}/lib"/>
+
+  <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/>
+
+  <!-- load nutch defaults last so that they can be overridden above -->
+  <property file="${nutch.root}/default.properties" />
+
+  <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" />
+
+  <path id="plugin.deps"/>
+
+  <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/>
+
+  <!-- the normal classpath -->
+  <path id="classpath">
+    <pathelement location="${build.classes}"/>
+    <fileset refid="lib.jars"/>
+    <pathelement location="${nutch.root}/build/classes"/>
+    <fileset dir="${nutch.root}/build/lib">
+      <include name="*.jar" />
+    </fileset>
+    <path refid="plugin.deps"/>
+    <fileset dir="${deploy.dir}">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+  <!-- the unit test classpath -->
+  <path id="test.classpath">
+    <pathelement location="${build.test}" />
+    <pathelement location="${nutch.root}/build/test/classes"/>
+    <pathelement location="${nutch.root}/src/test"/>
+    <pathelement location="${conf.dir}"/>
+    <pathelement location="${nutch.root}/build"/>
+    <!-- test dependencies specific to current plugin -->
+    <fileset dir="${build.test.lib}">
+      <include name="*.jar" />
+    </fileset>
+    <!-- global test dependencies -->
+    <fileset dir="${nutch.root}/build/test/lib">
+      <include name="*.jar" />
+    </fileset>
+    <path refid="classpath"/>
+  </path>
+
+  <!-- ====================================================== -->
+  <!-- Stuff needed by all targets                            -->
+  <!-- ====================================================== -->
+  <target name="init">
+    <mkdir dir="${build.dir}"/>
+    <mkdir dir="${build.classes}"/>
+    <mkdir dir="${build.test}"/>
+    <mkdir dir="${build.test.lib}"/>
+    <mkdir dir="${deploy.dir}"/>
+
+    <antcall target="init-plugin"/>
+  </target>
+
+  <!-- Empty hook run by "init" through antcall; to be overridden by sub-projects --> 
+  <target name="init-plugin"/>
+
+  <!--
+   ! Used to build plugin compilation dependencies; "compile" depends on
+   ! this target, which is empty here (to be overridden by plugins)
+   !-->
+  <target name="deps-jar"/>
+
+  <!--
+   ! Used to deploy plugin runtime dependencies; "deploy" depends on
+   ! this target, which is empty here (to be overridden by plugins)
+   !-->
+  <target name="deps-test"/>
+
+  <!--
+   ! Used to compile the tests of plugin runtime dependencies; "compile-test"
+   ! depends on this target, which is empty here (to be overridden by plugins)
+   !-->
+  <target name="deps-test-compile"/>
+
+  <!-- ====================================================== -->
+  <!-- Compile the Java files                                 -->
+  <!-- ====================================================== -->
+  <target name="compile" depends="init,deps-jar, resolve-default">
+    <echo message="Compiling plugin: ${name}"/>
+    <javac 
+     encoding="${build.encoding}" 
+     srcdir="${src.dir}"
+     includes="**/*.java"
+     destdir="${build.classes}"
+     debug="${javac.debug}"
+     optimize="${javac.optimize}"
+     target="${javac.version}"
+     source="${javac.version}"
+     deprecation="${javac.deprecation}">
+      <classpath refid="classpath"/>
+    </javac>
+  </target>
+
+  <target name="compile-core">
+    <ant target="compile-core" inheritall="false" dir="${nutch.root}"/>
+    <ant target="compile"/>
+  </target>
+  
+  <!-- ================================================================== -->
+  <!-- Make plugin .jar                                                   -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="jar" depends="compile">
+    <jar
+      jarfile="${build.dir}/${name}.jar"
+      basedir="${build.classes}"
+    />
+  </target>
+
+  <target name="jar-core" depends="compile-core">
+    <jar
+        jarfile="${build.dir}/${name}.jar"
+        basedir="${build.classes}"
+        />
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Deploy plugin to ${deploy.dir}                                     -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="deploy" depends="jar, deps-test">
+    <mkdir dir="${deploy.dir}"/>
+    <copy file="plugin.xml" todir="${deploy.dir}" 
+          preservelastmodified="true"/>
+    <available property="lib-available"
+                 file="${build.dir}/${name}.jar"/>
+    <antcall target="copy-generated-lib"/>
+    <copy todir="${deploy.dir}" flatten="true">
+      <fileset refid="lib.jars"/>
+    </copy>
+  </target>
+       
+  <target name="copy-generated-lib" if="lib-available">
+    <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Compile test code                                                  --> 
+  <!-- ================================================================== -->
+  <target name="compile-test" depends="compile, deps-test-compile" if="test.available">
+    <javac 
+     encoding="${build.encoding}" 
+     srcdir="${src.test}"
+     includes="**/*.java"
+     destdir="${build.test}"
+     debug="${javac.debug}"
+     optimize="${javac.optimize}"
+     target="${javac.version}"
+     source="${javac.version}"
+     deprecation="${javac.deprecation}">
+      <classpath refid="test.classpath"/>
+    </javac>    
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Run unit tests                                                     --> 
+  <!-- ================================================================== -->
+  <target name="test" depends="compile-test, deploy" if="test.available">
+    <echo message="Testing plugin: ${name}"/>
+
+    <junit printsummary="yes" haltonfailure="no" fork="yes"
+      errorProperty="tests.failed" failureProperty="tests.failed">
+      <sysproperty key="test.data" value="${build.test}/data"/>
+      <sysproperty key="test.input" value="${root}/data"/>
+      <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> 
+      <classpath refid="test.classpath"/>
+      <formatter type="${test.junit.output.format}" />
+      <batchtest todir="${build.test}" unless="testcase">
+        <fileset dir="${src.test}"
+                 includes="**/Test*.java" excludes="**/${test.exclude}.java" />
+      </batchtest>
+      <batchtest todir="${build.test}" if="testcase">
+        <fileset dir="${src.test}" includes="**/${testcase}.java"/>
+      </batchtest>
+    </junit>
+
+    <fail if="tests.failed">Tests failed!</fail>
+
+  </target>   
+
+  <!-- targets: resolve-default, resolve-test  ========================== -->
+  <target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy">
+    <ivy:resolve file="ivy.xml" conf="default" log="download-only"/>
+    <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
+  </target>
+
+  <target name="resolve-test" depends="clean-lib" description="resolve and retrieve dependencies with ivy">
+    <ivy:resolve file="ivy.xml" conf="test" log="download-only"/>
+    <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Clean.  Delete the build files, and their directories              -->
+  <!-- ================================================================== -->
+  <!-- target: clean  =================================================== -->
+  <target name="clean" depends="clean-build, clean-lib" description="--> clean the project" />
+
+  <!-- target: clean-lib  =============================================== -->
+  <target name="clean-lib" description="--> clean the project libraries directory (dependencies)">
+    <delete includeemptydirs="true" dir="${build.lib.dir}"/>
+  </target>
+
+  <!-- target: clean-build  ============================================= -->
+  <target name="clean-build" description="--> clean the project built files">
+    <delete includeemptydirs="true" dir="${build.dir}"/>
+    <delete includeemptydirs="true" dir="${deploy.dir}"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/build.xml b/nutch-plugins/build.xml
new file mode 100755
index 0000000..75ae2e7
--- /dev/null
+++ b/nutch-plugins/build.xml
@@ -0,0 +1,213 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="Nutch" default="deploy-core" basedir=".">
+
+  <target name="deploy-core">
+    <ant target="compile-core" inheritall="false" dir="../.."/>
+    <ant target="deploy"/>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Build & deploy all the plugin jars.                    -->
+  <!-- ====================================================== -->
+  <target name="deploy">
+     <ant dir="creativecommons" target="deploy"/>
+     <ant dir="feed" target="deploy"/>
+     <ant dir="headings" target="deploy"/>
+     <ant dir="index-basic" target="deploy"/>
+     <ant dir="index-anchor" target="deploy"/>
+     <ant dir="index-geoip" target="deploy"/>
+     <ant dir="index-more" target="deploy"/>
+     <ant dir="index-replace" target="deploy"/>
+     <ant dir="index-static" target="deploy"/>
+     <ant dir="index-metadata" target="deploy"/>
+     <ant dir="index-links" target="deploy"/>
+     <ant dir="mimetype-filter" target="deploy"/>
+     <ant dir="indexer-cloudsearch" target="deploy"/>
+     <ant dir="indexer-dummy" target="deploy"/>
+     <ant dir="indexer-elastic" target="deploy"/>
+     <ant dir="indexer-solr" target="deploy"/>
+     <ant dir="language-identifier" target="deploy"/>
+     <ant dir="lib-http" target="deploy"/>
+     <ant dir="lib-nekohtml" target="deploy"/>
+     <ant dir="lib-regex-filter" target="deploy"/>
+     <ant dir="lib-xml" target="deploy"/>
+     <ant dir="microformats-reltag" target="deploy"/>
+     <ant dir="nutch-extensionpoints" target="deploy"/>
+     <ant dir="protocol-file" target="deploy"/>
+     <ant dir="protocol-ftp" target="deploy"/>
+     <ant dir="protocol-http" target="deploy"/>
+     <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-htmlunit" target="deploy"/>
+     <ant dir="protocol-htmlunit" target="deploy" />
+     <ant dir="lib-selenium" target="deploy"/>
+     <ant dir="protocol-selenium" target="deploy" />
+     <ant dir="protocol-interactiveselenium" target="deploy" />
+     <ant dir="parse-ext" target="deploy"/>
+     <ant dir="parse-js" target="deploy"/>
+     <ant dir="parse-html" target="deploy"/>
+     <ant dir="parse-metatags" target="deploy"/>
+     <ant dir="parse-swf" target="deploy"/>
+     <ant dir="parse-tika" target="deploy"/>
+     <ant dir="parse-zip" target="deploy"/>
+     <ant dir="scoring-depth" target="deploy"/>
+     <ant dir="scoring-opic" target="deploy"/>
+     <ant dir="scoring-link" target="deploy"/>
+     <ant dir="scoring-similarity" target="deploy"/>
+     <ant dir="subcollection" target="deploy"/>
+     <ant dir="tld" target="deploy"/>
+     <ant dir="urlfilter-automaton" target="deploy"/>
+     <ant dir="urlfilter-domain" target="deploy" />
+     <ant dir="urlfilter-domainblacklist" target="deploy" />
+     <ant dir="urlfilter-prefix" target="deploy"/>
+     <ant dir="urlfilter-regex" target="deploy"/>
+     <ant dir="urlfilter-suffix" target="deploy"/>
+     <ant dir="urlfilter-validator" target="deploy"/>
+     <ant dir="urlfilter-ignoreexempt" target="deploy"/>
+     <ant dir="parsefilter-naivebayes" target="deploy"/>
+     <ant dir="parsefilter-regex" target="deploy"/>
+     <ant dir="urlmeta" target="deploy"/>
+     <ant dir="urlnormalizer-ajax" target="deploy"/>
+     <ant dir="urlnormalizer-basic" target="deploy"/>
+     <ant dir="urlnormalizer-host" target="deploy"/>
+     <ant dir="urlnormalizer-pass" target="deploy"/>
+     <ant dir="urlnormalizer-protocol" target="deploy"/>
+     <ant dir="urlnormalizer-querystring" target="deploy"/>
+     <ant dir="urlnormalizer-regex" target="deploy"/>
+     <ant dir="urlnormalizer-slash" target="deploy"/>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Test all of the plugins.                               -->
+  <!-- ====================================================== -->
+  <target name="test">
+    <parallel threadCount="2">
+     <ant dir="creativecommons" target="test"/>
+     <ant dir="index-basic" target="test"/>
+     <ant dir="index-anchor" target="test"/>
+     <ant dir="index-geoip" target="test"/>
+     <ant dir="index-more" target="test"/>
+     <ant dir="index-static" target="test"/>
+     <ant dir="index-replace" target="test"/>
+     <ant dir="index-links" target="test"/>
+     <ant dir="mimetype-filter" target="test"/>
+     <ant dir="language-identifier" target="test"/>
+     <ant dir="lib-http" target="test"/>
+     <ant dir="protocol-file" target="test"/>
+     <ant dir="protocol-http" target="test"/>
+     <ant dir="protocol-httpclient" target="test"/>
+     <!--ant dir="parse-ext" target="test"/-->
+     <ant dir="feed" target="test"/>
+     <ant dir="parse-html" target="test"/>
+     <ant dir="parse-metatags" target="test"/>
+     <ant dir="parse-swf" target="test"/>
+     <ant dir="parse-tika" target="test"/>
+     <ant dir="parse-zip" target="test"/>
+     <ant dir="parsefilter-regex" target="test"/>
+     <ant dir="subcollection" target="test"/>
+     <ant dir="urlfilter-automaton" target="test"/>
+     <ant dir="urlfilter-domain" target="test"/>
+     <ant dir="urlfilter-domainblacklist" target="test"/>
+     <ant dir="urlfilter-prefix" target="test"/>
+     <ant dir="urlfilter-regex" target="test"/>
+     <ant dir="urlfilter-suffix" target="test"/>
+     <ant dir="urlfilter-validator" target="test"/>
+     <ant dir="urlfilter-ignoreexempt" target="test"/>
+     <ant dir="urlnormalizer-ajax" target="test"/>
+     <ant dir="urlnormalizer-basic" target="test"/>
+     <ant dir="urlnormalizer-host" target="test"/>
+     <ant dir="urlnormalizer-pass" target="test"/>
+     <ant dir="urlnormalizer-protocol" target="test"/>
+     <ant dir="urlnormalizer-querystring" target="test"/>
+     <ant dir="urlnormalizer-regex" target="test"/>
+     <ant dir="urlnormalizer-slash" target="test"/>
+    </parallel>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Clean all of the plugins.                              -->
+  <!-- ====================================================== -->
+  <target name="clean">
+    <ant dir="creativecommons" target="clean"/>
+    <ant dir="feed" target="clean"/>
+    <ant dir="headings" target="clean"/>
+    <ant dir="index-basic" target="clean"/>
+    <ant dir="index-anchor" target="clean"/>
+    <ant dir="index-geoip" target="clean"/>
+    <ant dir="index-more" target="clean"/>
+    <ant dir="index-static" target="clean"/>
+    <ant dir="index-replace" target="clean"/>
+    <ant dir="index-metadata" target="clean"/>
+    <ant dir="index-links" target="clean"/>
+    <ant dir="mimetype-filter" target="clean"/>
+    <ant dir="indexer-cloudsearch" target="clean"/>
+    <ant dir="indexer-dummy" target="clean"/>
+    <ant dir="indexer-elastic" target="clean"/>
+    <ant dir="indexer-solr" target="clean"/>
+    <ant dir="language-identifier" target="clean"/>
+    <!-- <ant dir="lib-commons-httpclient" target="clean"/> -->
+    <ant dir="lib-http" target="clean"/>
+    <!-- <ant dir="lib-lucene-analyzers" target="clean"/>-->
+    <ant dir="lib-nekohtml" target="clean"/>
+    <ant dir="lib-regex-filter" target="clean"/>
+    <ant dir="lib-xml" target="clean"/>
+    <ant dir="microformats-reltag" target="clean"/>
+    <ant dir="nutch-extensionpoints" target="clean"/>
+    <ant dir="protocol-file" target="clean"/>
+    <ant dir="protocol-ftp" target="clean"/>
+    <ant dir="protocol-http" target="clean"/>
+    <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-htmlunit" target="clean"/>
+    <ant dir="protocol-htmlunit" target="clean" />
+    <ant dir="lib-selenium" target="clean"/>
+    <ant dir="protocol-selenium" target="clean" />
+    <ant dir="protocol-interactiveselenium" target="clean" />
+    <ant dir="parse-ext" target="clean"/>
+    <ant dir="parse-js" target="clean"/>
+    <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-metatags" target="clean"/>
+    <ant dir="parse-swf" target="clean"/>
+    <ant dir="parse-tika" target="clean"/>
+    <ant dir="parse-zip" target="clean"/>
+    <ant dir="parsefilter-regex" target="clean"/>
+    <ant dir="scoring-depth" target="clean"/>
+    <ant dir="scoring-opic" target="clean"/>
+    <ant dir="scoring-link" target="clean"/>
+    <ant dir="scoring-similarity" target="clean"/>
+    <ant dir="subcollection" target="clean"/>
+    <ant dir="tld" target="clean"/>
+    <ant dir="urlfilter-automaton" target="clean"/>
+    <ant dir="urlfilter-domain" target="clean" />
+    <ant dir="urlfilter-domainblacklist" target="clean" />
+    <ant dir="urlfilter-prefix" target="clean"/>
+    <ant dir="urlfilter-regex" target="clean"/>
+    <ant dir="urlfilter-suffix" target="clean"/>
+    <ant dir="urlfilter-validator" target="clean"/>
+    <ant dir="urlfilter-ignoreexempt" target="clean"/>
+    <ant dir="parsefilter-naivebayes" target="clean" />
+    <ant dir="urlmeta" target="clean"/>
+    <ant dir="urlnormalizer-ajax" target="clean"/>
+    <ant dir="urlnormalizer-basic" target="clean"/>
+    <ant dir="urlnormalizer-host" target="clean"/>
+    <ant dir="urlnormalizer-pass" target="clean"/>
+    <ant dir="urlnormalizer-protocol" target="clean"/>
+    <ant dir="urlnormalizer-querystring" target="clean"/>
+    <ant dir="urlnormalizer-regex" target="clean"/>
+    <ant dir="urlnormalizer-slash" target="clean"/>
+  </target>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/README.txt 
b/nutch-plugins/creativecommons/README.txt
new file mode 100644
index 0000000..d4d7b65
--- /dev/null
+++ b/nutch-plugins/creativecommons/README.txt
@@ -0,0 +1 @@
+Support for crawling and searching Creative-Commons licensed content. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/build.xml 
b/nutch-plugins/creativecommons/build.xml
new file mode 100755
index 0000000..6443d7f
--- /dev/null
+++ b/nutch-plugins/creativecommons/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="creativecommons" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+   <!--  <ant target="deploy" inheritall="false" dir="../parse-html"/> -->
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt 
b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
new file mode 100644
index 0000000..324617f
--- /dev/null
+++ b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
@@ -0,0 +1,18 @@
+# Creative Commons crawl filter
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, mailto:, & https: urls
+-^(file|ftp|mailto|https):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# accept anything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/conf/nutch-site.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/conf/nutch-site.xml 
b/nutch-plugins/creativecommons/conf/nutch-site.xml
new file mode 100644
index 0000000..71e344b
--- /dev/null
+++ b/nutch-plugins/creativecommons/conf/nutch-site.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+
+<!-- Creative Commons' Nutch configuration -->
+
+<nutch-conf>
+
+<property>
+  <name>http.agent.name</name>
+  <value>CreativeCommons</value>
+  <description>Our HTTP 'User-Agent' request header.</description>
+</property>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>CreativeCommons,Nutch,*</value>
+  <description>The agent strings we'll look for in robots.txt files,
+  comma-separated, in decreasing order of precedence.</description>
+</property>
+
+<property>
+  <name>fetcher.server.delay</name>
+  <value>2.0</value>
+  <description>We need to be more polite than when crawling an
+  intranet that we control.</description>
+</property>
+
+<property>
+  <name>http.max.delays</name>
+  <value>3</value>
+  <description>The CC crawl visits a large number of different
+  hosts, so we should not need to delay much.</description>
+</property>
+
+<property>
+  <name>creativecommons.exclude.unlicensed</name>
+  <value>true</value>
+  <description>Exclude HTML content which does not contain a CC license.
+  </description>
+</property>
+
+<property>
+  <name>plugin.excludes</name>
+  <value>parse-(?!html).*</value>
+  <description>Exclude non-HTML content, since we don't know how to
+  find a CC license in anything but HTML. 
+  </description>
+</property>
+
+</nutch-conf>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/anchor.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/data/anchor.html 
b/nutch-plugins/creativecommons/data/anchor.html
new file mode 100755
index 0000000..90b5227
--- /dev/null
+++ b/nutch-plugins/creativecommons/data/anchor.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
+<html>
+<head>
+</head>
+<body>
+<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img 
alt="Creative Commons License" 
src="http://creativecommons.org/images/public/somerights.gif" 
align="right"></a>This file is licensed under a
+<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons 
License</a>.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/rdf.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/data/rdf.html 
b/nutch-plugins/creativecommons/data/rdf.html
new file mode 100755
index 0000000..fb2c34d
--- /dev/null
+++ b/nutch-plugins/creativecommons/data/rdf.html
@@ -0,0 +1,35 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+ <head>
+ </head>
+ <body>
+
+<!-- Creative Commons License -->
+<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative 
Commons License" border="0" 
src="http://creativecommons.org/images/public/somerights.gif" /></a><br />
+This work is licensed under a
+<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons 
License</a>.
+<!--  end Creative Commons License -->
+
+  <!--
+<rdf:RDF xmlns="http://web.resource.org/cc/"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<Work rdf:about="http://boingboing.net">
+   <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" />
+   <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" />
+</Work>
+
+<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0">
+   <requires rdf:resource="http://web.resource.org/cc/Attribution" />
+   <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" />
+   <permits rdf:resource="http://web.resource.org/cc/Reproduction" />
+   <permits rdf:resource="http://web.resource.org/cc/Distribution" />
+   <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" />
+   <requires rdf:resource="http://web.resource.org/cc/Notice" />
+</License>
+
+</rdf:RDF>
+
+-->
+ </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/rel.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/data/rel.html 
b/nutch-plugins/creativecommons/data/rel.html
new file mode 100755
index 0000000..413d52f
--- /dev/null
+++ b/nutch-plugins/creativecommons/data/rel.html
@@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
+</head><body>
+<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC 
by-nc</a> 
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/ivy.xml 
b/nutch-plugins/creativecommons/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/creativecommons/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/plugin.xml 
b/nutch-plugins/creativecommons/plugin.xml
new file mode 100755
index 0000000..de9cf36
--- /dev/null
+++ b/nutch-plugins/creativecommons/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="creativecommons"
+   name="Creative Commons Plugins"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="creativecommons.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.creativecommons.nutch.CCParseFilter"
+              name="Creative Commons Metadata Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="CCParseFilter"
+                      class="org.creativecommons.nutch.CCParseFilter"/>
+   </extension>
+
+   <extension id="org.creativecommons.nutch.CCIndexingFilter"
+              name="Creative Commons Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="CCIndexingFilter"
+                      class="org.creativecommons.nutch.CCIndexingFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/pom.xml 
b/nutch-plugins/creativecommons/pom.xml
new file mode 100644
index 0000000..7eb7564
--- /dev/null
+++ b/nutch-plugins/creativecommons/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>creativecommons</artifactId>
+    <packaging>jar</packaging>
+
+    <name>creativecommons</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

Reply via email to