http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html new file mode 100644 index 0000000..5dcf7c2 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html @@ -0,0 +1,23 @@ +<html> +<head> +<title>nested spider trap</title> +</head> + +<body>Nutch fetcher test page +<table> + <tr> + <td> +<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> +<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i ></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> +<i><b><i><b><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></ b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></b></i></b></i> 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>< b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b 
><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b> <i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>< 
i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i ><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i> 
<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>< b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>< 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></ b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i 
></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>< 
/b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></ i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b 
></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> </b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>< 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> +</b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>< i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> + + </td> + </tr> + +</table> +</body> +</html> \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/pagea.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/pagea.html b/nutch-core/src/test/resources/fetch-test-site/pagea.html new file mode 100644 index 0000000..6444c41 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/pagea.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/pageb.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/pageb.html b/nutch-core/src/test/resources/fetch-test-site/pageb.html new file mode 100644 index 0000000..66e3725 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/pageb.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>bage b</title> + </head> +<body> +This is page b +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/robots.txt ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/robots.txt b/nutch-core/src/test/resources/fetch-test-site/robots.txt new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-mime-util/test.xlsx ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-mime-util/test.xlsx b/nutch-core/src/test/resources/test-mime-util/test.xlsx new file mode 100644 index 0000000..de33f28 Binary 
files /dev/null and b/nutch-core/src/test/resources/test-mime-util/test.xlsx differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc new file mode 100644 index 0000000..c321777 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc new file mode 100644 index 0000000..5c5d11f Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data new file mode 100644 index 0000000..0f8d263 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index new file mode 100644 index 0000000..4dfeaec Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc new file mode 100644 index 0000000..c4d315a Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc new file mode 100644 index 0000000..6dd171e Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data new file mode 100644 index 0000000..66b1f8d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index new file mode 100644 index 0000000..ad4ed47 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc new file mode 100644 index 0000000..8d5ffa4 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 new file mode 100644 index 0000000..41ef146 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 differ 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc new file mode 100644 index 0000000..683a1dd Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 new file mode 100644 index 0000000..3232abf Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc new file mode 100644 index 0000000..47164ee Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc new file mode 100644 index 0000000..a32d62d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data new file mode 100644 index 0000000..5b71a24 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index new file mode 100644 index 0000000..d931103 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc new file mode 100644 index 0000000..53c925c Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc new file mode 100644 index 0000000..5ba878c Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data new file mode 100644 index 0000000..b58f97f Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index new file mode 100644 index 0000000..9880a27 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc new file mode 100644 index 0000000..1b49819 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc new file mode 100644 index 0000000..5aae648 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data new file mode 100644 index 0000000..8069e84 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index new file mode 100644 index 
0000000..9b19ce9 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc new file mode 100644 index 0000000..926ced1 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc new file mode 100644 index 0000000..714a1e8 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data new file mode 100644 index 0000000..f36a9fa Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data differ 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index new file mode 100644 index 0000000..c648d89 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc new file mode 100644 index 0000000..3ee3c94 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 new file mode 100644 index 0000000..1ef0406 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc new file mode 100644 index 0000000..7948825 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 new file mode 100644 index 0000000..3a83a82 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc new file mode 100644 index 0000000..b46b6f6 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc new file mode 100644 index 0000000..18766e6 Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data new file mode 100644 index 0000000..9a1f284 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index new file mode 100644 index 0000000..47fb983 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc new file mode 100644 index 0000000..ceada1b Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc new file mode 100644 index 0000000..b756b5c Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data new file mode 100644 index 0000000..ad96df0 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index new file mode 100644 index 0000000..a3e1d8d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/build-plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/build-plugin.xml b/nutch-plugins/build-plugin.xml new file mode 100755 index 0000000..c759d5f --- /dev/null +++ b/nutch-plugins/build-plugin.xml @@ -0,0 +1,255 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software 
Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- Imported by plugin build.xml files to define default targets. --> +<project xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="name" value="${ant.project.name}"/> + <property name="root" value="${basedir}"/> + + <!-- load plugin-specific properties first --> + <property file="${user.home}/${name}.build.properties" /> + <property file="${root}/build.properties" /> + + <property name="nutch.root" location="${root}/../../../"/> + + <property name="src.dir" location="${root}/src/java"/> + <property name="src.test" location="${root}/src/test"/> + + <available file="${src.test}" type="dir" property="test.available"/> + + <property name="conf.dir" location="${nutch.root}/conf"/> + + <property name="build.dir" location="${nutch.root}/build/${name}"/> + <property name="build.classes" location="${build.dir}/classes"/> + <property name="build.test" location="${build.dir}/test"/> + <property name="build.test.lib" location="${build.test}/lib"/> + + <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/> + + <!-- load nutch defaults last so that they can be overridden above --> + <property file="${nutch.root}/default.properties" /> + + <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" /> + 
+ <path id="plugin.deps"/> + + <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/> + + <!-- the normal classpath --> + <path id="classpath"> + <pathelement location="${build.classes}"/> + <fileset refid="lib.jars"/> + <pathelement location="${nutch.root}/build/classes"/> + <fileset dir="${nutch.root}/build/lib"> + <include name="*.jar" /> + </fileset> + <path refid="plugin.deps"/> + <fileset dir="${deploy.dir}"> + <include name="*.jar" /> + </fileset> + </path> + + <!-- the unit test classpath --> + <path id="test.classpath"> + <pathelement location="${build.test}" /> + <pathelement location="${nutch.root}/build/test/classes"/> + <pathelement location="${nutch.root}/src/test"/> + <pathelement location="${conf.dir}"/> + <pathelement location="${nutch.root}/build"/> + <!-- test dependencies specific to current plugin --> + <fileset dir="${build.test.lib}"> + <include name="*.jar" /> + </fileset> + <!-- global test dependencies --> + <fileset dir="${nutch.root}/build/test/lib"> + <include name="*.jar" /> + </fileset> + <path refid="classpath"/> + </path> + + <!-- ====================================================== --> + <!-- Stuff needed by all targets --> + <!-- ====================================================== --> + <target name="init"> + <mkdir dir="${build.dir}"/> + <mkdir dir="${build.classes}"/> + <mkdir dir="${build.test}"/> + <mkdir dir="${build.test.lib}"/> + <mkdir dir="${deploy.dir}"/> + + <antcall target="init-plugin"/> + </target> + + <!-- to be overridden by sub-projects --> + <target name="init-plugin"/> + + <!-- + ! Used to build plugin compilation dependencies + ! (to be overridden by plugins) + !--> + <target name="deps-jar"/> + + <!-- + ! Used to deploy plugin runtime dependencies + ! (to be overridden by plugins) + !--> + <target name="deps-test"/> + + <!-- + ! Used to compile test for plugin runtime dependencies + ! 
(to be overridden by plugins) + !--> + <target name="deps-test-compile"/> + + <!-- ====================================================== --> + <!-- Compile the Java files --> + <!-- ====================================================== --> + <target name="compile" depends="init,deps-jar, resolve-default"> + <echo message="Compiling plugin: ${name}"/> + <javac + encoding="${build.encoding}" + srcdir="${src.dir}" + includes="**/*.java" + destdir="${build.classes}" + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> + <classpath refid="classpath"/> + </javac> + </target> + + <target name="compile-core"> + <ant target="compile-core" inheritall="false" dir="${nutch.root}"/> + <ant target="compile"/> + </target> + + <!-- ================================================================== --> + <!-- Make plugin .jar --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="jar" depends="compile"> + <jar + jarfile="${build.dir}/${name}.jar" + basedir="${build.classes}" + /> + </target> + + <target name="jar-core" depends="compile-core"> + <jar + jarfile="${build.dir}/${name}.jar" + basedir="${build.classes}" + /> + </target> + + <!-- ================================================================== --> + <!-- Deploy plugin to ${deploy.dir} --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="deploy" depends="jar, deps-test"> + <mkdir dir="${deploy.dir}"/> + <copy file="plugin.xml" todir="${deploy.dir}" + preservelastmodified="true"/> + <available property="lib-available" + file="${build.dir}/${name}.jar"/> + <antcall target="copy-generated-lib"/> + <copy todir="${deploy.dir}" flatten="true"> + <fileset 
refid="lib.jars"/> + </copy> + </target> + + <target name="copy-generated-lib" if="lib-available"> + <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/> + </target> + + <!-- ================================================================== --> + <!-- Compile test code --> + <!-- ================================================================== --> + <target name="compile-test" depends="compile, deps-test-compile" if="test.available"> + <javac + encoding="${build.encoding}" + srcdir="${src.test}" + includes="**/*.java" + destdir="${build.test}" + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> + <classpath refid="test.classpath"/> + </javac> + </target> + + <!-- ================================================================== --> + <!-- Run unit tests --> + <!-- ================================================================== --> + <target name="test" depends="compile-test, deploy" if="test.available"> + <echo message="Testing plugin: ${name}"/> + + <junit printsummary="yes" haltonfailure="no" fork="yes" + errorProperty="tests.failed" failureProperty="tests.failed"> + <sysproperty key="test.data" value="${build.test}/data"/> + <sysproperty key="test.input" value="${root}/data"/> + <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> + <classpath refid="test.classpath"/> + <formatter type="${test.junit.output.format}" /> + <batchtest todir="${build.test}" unless="testcase"> + <fileset dir="${src.test}" + includes="**/Test*.java" excludes="**/${test.exclude}.java" /> + </batchtest> + <batchtest todir="${build.test}" if="testcase"> + <fileset dir="${src.test}" includes="**/${testcase}.java"/> + </batchtest> + </junit> + + <fail if="tests.failed">Tests failed!</fail> + + </target> + + <!-- target: resolve ================================================= 
--> + <target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> + <ivy:resolve file="ivy.xml" conf="default" log="download-only"/> + <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> + </target> + + <target name="resolve-test" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> + <ivy:resolve file="ivy.xml" conf="test" log="download-only"/> + <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> + </target> + + <!-- ================================================================== --> + <!-- Clean. Delete the build files, and their directories --> + <!-- ================================================================== --> + <!-- target: clean =================================================== --> + <target name="clean" depends="clean-build, clean-lib" description="--> clean the project" /> + + <!-- target: clean-lib =============================================== --> + <target name="clean-lib" description="--> clean the project libraries directory (dependencies)"> + <delete includeemptydirs="true" dir="${build.lib.dir}"/> + </target> + + <!-- target: clean-build ============================================= --> + <target name="clean-build" description="--> clean the project built files"> + <delete includeemptydirs="true" dir="${build.dir}"/> + <delete includeemptydirs="true" dir="${deploy.dir}"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/build.xml b/nutch-plugins/build.xml new file mode 100755 index 0000000..75ae2e7 --- /dev/null +++ b/nutch-plugins/build.xml @@ -0,0 +1,213 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="Nutch" default="deploy-core" basedir="."> + + <target name="deploy-core"> + <ant target="compile-core" inheritall="false" dir="../.."/> + <ant target="deploy"/> + </target> + + <!-- ====================================================== --> + <!-- Build & deploy all the plugin jars. --> + <!-- ====================================================== --> + <target name="deploy"> + <ant dir="creativecommons" target="deploy"/> + <ant dir="feed" target="deploy"/> + <ant dir="headings" target="deploy"/> + <ant dir="index-basic" target="deploy"/> + <ant dir="index-anchor" target="deploy"/> + <ant dir="index-geoip" target="deploy"/> + <ant dir="index-more" target="deploy"/> + <ant dir="index-replace" target="deploy"/> + <ant dir="index-static" target="deploy"/> + <ant dir="index-metadata" target="deploy"/> + <ant dir="index-links" target="deploy"/> + <ant dir="mimetype-filter" target="deploy"/> + <ant dir="indexer-cloudsearch" target="deploy"/> + <ant dir="indexer-dummy" target="deploy"/> + <ant dir="indexer-elastic" target="deploy"/> + <ant dir="indexer-solr" target="deploy"/> + <ant dir="language-identifier" target="deploy"/> + <ant dir="lib-http" target="deploy"/> + <ant dir="lib-nekohtml" target="deploy"/> + <ant dir="lib-regex-filter" target="deploy"/> + <ant dir="lib-xml" 
target="deploy"/> + <ant dir="microformats-reltag" target="deploy"/> + <ant dir="nutch-extensionpoints" target="deploy"/> + <ant dir="protocol-file" target="deploy"/> + <ant dir="protocol-ftp" target="deploy"/> + <ant dir="protocol-http" target="deploy"/> + <ant dir="protocol-httpclient" target="deploy"/> + <ant dir="lib-htmlunit" target="deploy"/> + <ant dir="protocol-htmlunit" target="deploy" /> + <ant dir="lib-selenium" target="deploy"/> + <ant dir="protocol-selenium" target="deploy" /> + <ant dir="protocol-interactiveselenium" target="deploy" /> + <ant dir="parse-ext" target="deploy"/> + <ant dir="parse-js" target="deploy"/> + <ant dir="parse-html" target="deploy"/> + <ant dir="parse-metatags" target="deploy"/> + <ant dir="parse-swf" target="deploy"/> + <ant dir="parse-tika" target="deploy"/> + <ant dir="parse-zip" target="deploy"/> + <ant dir="scoring-depth" target="deploy"/> + <ant dir="scoring-opic" target="deploy"/> + <ant dir="scoring-link" target="deploy"/> + <ant dir="scoring-similarity" target="deploy"/> + <ant dir="subcollection" target="deploy"/> + <ant dir="tld" target="deploy"/> + <ant dir="urlfilter-automaton" target="deploy"/> + <ant dir="urlfilter-domain" target="deploy" /> + <ant dir="urlfilter-domainblacklist" target="deploy" /> + <ant dir="urlfilter-prefix" target="deploy"/> + <ant dir="urlfilter-regex" target="deploy"/> + <ant dir="urlfilter-suffix" target="deploy"/> + <ant dir="urlfilter-validator" target="deploy"/> + <ant dir="urlfilter-ignoreexempt" target="deploy"/> + <ant dir="parsefilter-naivebayes" target="deploy"/> + <ant dir="parsefilter-regex" target="deploy"/> + <ant dir="urlmeta" target="deploy"/> + <ant dir="urlnormalizer-ajax" target="deploy"/> + <ant dir="urlnormalizer-basic" target="deploy"/> + <ant dir="urlnormalizer-host" target="deploy"/> + <ant dir="urlnormalizer-pass" target="deploy"/> + <ant dir="urlnormalizer-protocol" target="deploy"/> + <ant dir="urlnormalizer-querystring" target="deploy"/> + <ant 
dir="urlnormalizer-regex" target="deploy"/> + <ant dir="urlnormalizer-slash" target="deploy"/> + </target> + + <!-- ====================================================== --> + <!-- Test all of the plugins. --> + <!-- ====================================================== --> + <target name="test"> + <parallel threadCount="2"> + <ant dir="creativecommons" target="test"/> + <ant dir="index-basic" target="test"/> + <ant dir="index-anchor" target="test"/> + <ant dir="index-geoip" target="test"/> + <ant dir="index-more" target="test"/> + <ant dir="index-static" target="test"/> + <ant dir="index-replace" target="test"/> + <ant dir="index-links" target="test"/> + <ant dir="mimetype-filter" target="test"/> + <ant dir="language-identifier" target="test"/> + <ant dir="lib-http" target="test"/> + <ant dir="protocol-file" target="test"/> + <ant dir="protocol-http" target="test"/> + <ant dir="protocol-httpclient" target="test"/> + <!--ant dir="parse-ext" target="test"/--> + <ant dir="feed" target="test"/> + <ant dir="parse-html" target="test"/> + <ant dir="parse-metatags" target="test"/> + <ant dir="parse-swf" target="test"/> + <ant dir="parse-tika" target="test"/> + <ant dir="parse-zip" target="test"/> + <ant dir="parsefilter-regex" target="test"/> + <ant dir="subcollection" target="test"/> + <ant dir="urlfilter-automaton" target="test"/> + <ant dir="urlfilter-domain" target="test"/> + <ant dir="urlfilter-domainblacklist" target="test"/> + <ant dir="urlfilter-prefix" target="test"/> + <ant dir="urlfilter-regex" target="test"/> + <ant dir="urlfilter-suffix" target="test"/> + <ant dir="urlfilter-validator" target="test"/> + <ant dir="urlfilter-ignoreexempt" target="test"/> + <ant dir="urlnormalizer-ajax" target="test"/> + <ant dir="urlnormalizer-basic" target="test"/> + <ant dir="urlnormalizer-host" target="test"/> + <ant dir="urlnormalizer-pass" target="test"/> + <ant dir="urlnormalizer-protocol" target="test"/> + <ant dir="urlnormalizer-querystring" target="test"/> + <ant 
dir="urlnormalizer-regex" target="test"/> + <ant dir="urlnormalizer-slash" target="test"/> + </parallel> + </target> + + <!-- ====================================================== --> + <!-- Clean all of the plugins. --> + <!-- ====================================================== --> + <target name="clean"> + <ant dir="creativecommons" target="clean"/> + <ant dir="feed" target="clean"/> + <ant dir="headings" target="clean"/> + <ant dir="index-basic" target="clean"/> + <ant dir="index-anchor" target="clean"/> + <ant dir="index-geoip" target="clean"/> + <ant dir="index-more" target="clean"/> + <ant dir="index-static" target="clean"/> + <ant dir="index-replace" target="clean"/> + <ant dir="index-metadata" target="clean"/> + <ant dir="index-links" target="clean"/> + <ant dir="mimetype-filter" target="clean"/> + <ant dir="indexer-cloudsearch" target="clean"/> + <ant dir="indexer-dummy" target="clean"/> + <ant dir="indexer-elastic" target="clean"/> + <ant dir="indexer-solr" target="clean"/> + <ant dir="language-identifier" target="clean"/> + <!-- <ant dir="lib-commons-httpclient" target="clean"/> --> + <ant dir="lib-http" target="clean"/> + <!-- <ant dir="lib-lucene-analyzers" target="clean"/>--> + <ant dir="lib-nekohtml" target="clean"/> + <ant dir="lib-regex-filter" target="clean"/> + <ant dir="lib-xml" target="clean"/> + <ant dir="microformats-reltag" target="clean"/> + <ant dir="nutch-extensionpoints" target="clean"/> + <ant dir="protocol-file" target="clean"/> + <ant dir="protocol-ftp" target="clean"/> + <ant dir="protocol-http" target="clean"/> + <ant dir="protocol-httpclient" target="clean"/> + <ant dir="lib-htmlunit" target="clean"/> + <ant dir="protocol-htmlunit" target="clean" /> + <ant dir="lib-selenium" target="clean"/> + <ant dir="protocol-selenium" target="clean" /> + <ant dir="protocol-interactiveselenium" target="clean" /> + <ant dir="parse-ext" target="clean"/> + <ant dir="parse-js" target="clean"/> + <ant dir="parse-html" target="clean"/> + <ant 
dir="parse-metatags" target="clean"/> + <ant dir="parse-swf" target="clean"/> + <ant dir="parse-tika" target="clean"/> + <ant dir="parse-zip" target="clean"/> + <ant dir="parsefilter-regex" target="clean"/> + <ant dir="scoring-depth" target="clean"/> + <ant dir="scoring-opic" target="clean"/> + <ant dir="scoring-link" target="clean"/> + <ant dir="scoring-similarity" target="clean"/> + <ant dir="subcollection" target="clean"/> + <ant dir="tld" target="clean"/> + <ant dir="urlfilter-automaton" target="clean"/> + <ant dir="urlfilter-domain" target="clean" /> + <ant dir="urlfilter-domainblacklist" target="clean" /> + <ant dir="urlfilter-prefix" target="clean"/> + <ant dir="urlfilter-regex" target="clean"/> + <ant dir="urlfilter-suffix" target="clean"/> + <ant dir="urlfilter-validator" target="clean"/> + <ant dir="urlfilter-ignoreexempt" target="clean"/> + <ant dir="parsefilter-naivebayes" target="clean" /> + <ant dir="urlmeta" target="clean"/> + <ant dir="urlnormalizer-ajax" target="clean"/> + <ant dir="urlnormalizer-basic" target="clean"/> + <ant dir="urlnormalizer-host" target="clean"/> + <ant dir="urlnormalizer-pass" target="clean"/> + <ant dir="urlnormalizer-protocol" target="clean"/> + <ant dir="urlnormalizer-querystring" target="clean"/> + <ant dir="urlnormalizer-regex" target="clean"/> + <ant dir="urlnormalizer-slash" target="clean"/> + </target> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/README.txt b/nutch-plugins/creativecommons/README.txt new file mode 100644 index 0000000..d4d7b65 --- /dev/null +++ b/nutch-plugins/creativecommons/README.txt @@ -0,0 +1 @@ +Support for crawling and searching Creative-Commons licensed content. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/build.xml b/nutch-plugins/creativecommons/build.xml new file mode 100755 index 0000000..6443d7f --- /dev/null +++ b/nutch-plugins/creativecommons/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="creativecommons" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <!-- <ant target="deploy" inheritall="false" dir="../parse-html"/> --> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt new file mode 100644 index 0000000..324617f --- /dev/null +++ b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt @@ -0,0 +1,18 @@ +# Creative Commons crawl filter + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, mailto:, & https: urls +-^(file|ftp|mailto|https): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# accept anything else ++. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/conf/nutch-site.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/conf/nutch-site.xml b/nutch-plugins/creativecommons/conf/nutch-site.xml new file mode 100644 index 0000000..71e344b --- /dev/null +++ b/nutch-plugins/creativecommons/conf/nutch-site.xml @@ -0,0 +1,50 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?> + +<!-- Creative Commons' Nutch configuration --> + +<nutch-conf> + +<property> + <name>http.agent.name</name> + <value>CreativeCommons</value> + <description>Our HTTP 'User-Agent' request header.</description> +</property> + +<property> + <name>http.robots.agents</name> + <value>CreativeCommons,Nutch,*</value> + <description>The agent strings we'll look for in robots.txt files, + comma-separated, in decreasing order of precedence.</description> +</property> + +<property> + <name>fetcher.server.delay</name> + <value>2.0</value> + <description>We need to be more polite than when crawling an + intranet that we control.</description> +</property> + +<property> + <name>http.max.delays</name> + <value>3</value> + <description>The CC crawl visits a large number of different + hosts, so we should not need to delay much.</description> +</property> + +<property> + <name>creativecommons.exclude.unlicensed</name> + <value>true</value> + <description>Exclude HTML content which does not contain a CC license. + </description> +</property> + +<property> + <name>plugin.excludes</name> + <value>parse-(?!html).*</value> + <description>Exclude non-HTML content, since we don't know how to + find a CC license in anything but HTML. 
+ </description> +</property> + +</nutch-conf> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/anchor.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/anchor.html b/nutch-plugins/creativecommons/data/anchor.html new file mode 100755 index 0000000..90b5227 --- /dev/null +++ b/nutch-plugins/creativecommons/data/anchor.html @@ -0,0 +1,9 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd"> +<html> +<head> +</head> +<body> +<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a +<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/rdf.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/rdf.html b/nutch-plugins/creativecommons/data/rdf.html new file mode 100755 index 0000000..fb2c34d --- /dev/null +++ b/nutch-plugins/creativecommons/data/rdf.html @@ -0,0 +1,35 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + </head> + <body> + +<!-- Creative Commons License --> +<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br /> +This work is licensed under a +<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>. 
+<!-- end Creative Commons License --> + + <!-- +<rdf:RDF xmlns="http://web.resource.org/cc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> +<Work rdf:about="http://boingboing.net"> + <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" /> + <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" /> +</Work> + +<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0"> + <requires rdf:resource="http://web.resource.org/cc/Attribution" /> + <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" /> + <permits rdf:resource="http://web.resource.org/cc/Reproduction" /> + <permits rdf:resource="http://web.resource.org/cc/Distribution" /> + <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" /> + <requires rdf:resource="http://web.resource.org/cc/Notice" /> +</License> + +</rdf:RDF> + +--> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/data/rel.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/rel.html b/nutch-plugins/creativecommons/data/rel.html new file mode 100755 index 0000000..413d52f --- /dev/null +++ b/nutch-plugins/creativecommons/data/rel.html @@ -0,0 +1,6 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head> +</head><body> +<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/ivy.xml b/nutch-plugins/creativecommons/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/creativecommons/ivy.xml @@ -0,0 
+1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/plugin.xml b/nutch-plugins/creativecommons/plugin.xml new file mode 100755 index 0000000..de9cf36 --- /dev/null +++ b/nutch-plugins/creativecommons/plugin.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="creativecommons" + name="Creative Commons Plugins" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="creativecommons.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.creativecommons.nutch.CCParseFilter" + name="Creative Commons Metadata Filter" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="CCParseFilter" + class="org.creativecommons.nutch.CCParseFilter"/> + </extension> + + <extension id="org.creativecommons.nutch.CCIndexingFilter" + name="Creative Commons Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="CCIndexingFilter" + class="org.creativecommons.nutch.CCIndexingFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/pom.xml b/nutch-plugins/creativecommons/pom.xml new file mode 100644 index 0000000..7eb7564 --- /dev/null +++ b/nutch-plugins/creativecommons/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. 
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>creativecommons</artifactId> + <packaging>jar</packaging> + + <name>creativecommons</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project>
