http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java b/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java new file mode 100644 index 0000000..49bcfa9 --- /dev/null +++ b/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.conf.*; +import org.junit.Assert; + +public class WritableTestUtils { + + /** Asserts that {@code before} equals the copy obtained by writing it out and reading it back; delegates to the two-argument overload with a null conf. */ + public static void testWritable(Writable before) throws Exception { + testWritable(before, null); + } + + /** Asserts, via {@code Assert.assertEquals}, that {@code before} equals the copy returned by {@code writeRead(before, conf)}. */ + public static void testWritable(Writable before, Configuration conf) + throws Exception { + Assert.assertEquals(before, writeRead(before, conf)); + } + + /** Writes {@code before} into an in-memory DataOutputBuffer and reads it back through a DataInputBuffer into a new instance created by calling newInstance() on the class of {@code before}; when {@code conf} is non-null the new instance is cast to Configurable and given {@code conf} before its fields are read. Returns the new instance. 
*/ + public static Writable writeRead(Writable before, Configuration conf) + throws Exception { + + DataOutputBuffer dob = new DataOutputBuffer(); + before.write(dob); + + DataInputBuffer dib = new DataInputBuffer(); + dib.reset(dob.getData(), dob.getLength()); + + Writable after = (Writable) before.getClass().newInstance(); + if (conf != null) { + ((Configurable) after).setConf(conf); + } + after.readFields(dib); + return after; + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/crawl-tests.xml ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/crawl-tests.xml b/nutch-core/src/test/resources/crawl-tests.xml new file mode 100644 index 0000000..01fc683 --- /dev/null +++ b/nutch-core/src/test/resources/crawl-tests.xml @@ -0,0 +1,62 @@ +<?xml version="1.0"?> + +<!-- Configuration overrides used during unit tests. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <value>parse-tika|protocol-http|urlfilter-suffix|scoring-opic</value> + <description>Enable required plugins.</description> +</property> + +<property> + <name>content.server.port</name> + <value>55000</value> + <description>Port of http server serving content.</description> +</property> + +<property> + <name>fetcher.server.delay</name> + <value>0.2</value> + <description>The number of seconds the fetcher will delay between + successive requests to the same server.</description> +</property> + +<property> + <name>http.agent.name</name> + <value>test-nutch</value> +</property> + +<property> + <name>http.robots.agents</name> + <value>test-nutch,*</value> +</property> + +<property> + <name>http.agent.name.check</name> + <value>true</value> +</property> + +<property> + <name>http.robots.agents</name> + <value>test-nutch,*</value> + <description>The agent strings we'll look for in robots.txt files, + comma-separated, in decreasing order of precedence. You should + put the value of http.agent.name as the first agent name, and keep the + default * at the end of the list. 
E.g.: BlurflDev,Blurfl,* + </description> +</property> + +<property> + <name>io.serializations</name> + <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value> + <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, + org.apache.hadoop.io.serializer.avro.AvroReflectSerialization, + org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, --> + <description>A list of serialization classes that can be used for + obtaining serializers and deserializers.</description> +</property> + +</configuration> + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/domain-urlfilter.txt ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/domain-urlfilter.txt b/nutch-core/src/test/resources/domain-urlfilter.txt new file mode 100644 index 0000000..955700a --- /dev/null +++ b/nutch-core/src/test/resources/domain-urlfilter.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# config file for urlfilter-domain plugin + +com +org +net +edu +gov http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html new file mode 100644 index 0000000..6444c41 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/exception.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/exception.html b/nutch-core/src/test/resources/fetch-test-site/exception.html new file mode 100644 index 0000000..e1192a1 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/exception.html @@ -0,0 +1,13 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<HTML> +<HEAD> +<TITLE>Exception</TITLE> +<META http-equiv="Content-Type" content="text/html; charset=unicode"> +</HEAD> +<BODY> +!!Trying to parse this one will fail with a MalformedInputException!! + +Nutch fetcher test page. 
+</BODY> +</HTML> + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/index.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/index.html b/nutch-core/src/test/resources/fetch-test-site/index.html new file mode 100644 index 0000000..d73ff3f --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/index.html @@ -0,0 +1,13 @@ +<html> + <head> + <title>front page</title> + </head> +<body> +This is front page. +<a href="pagea.html">Page a</a> +<a href="pageb.html">Page b</a> +<a href="dup_of_pagea.html">dup of Page a</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html new file mode 100644 index 0000000..5dcf7c2 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html @@ -0,0 +1,23 @@ +<html> +<head> +<title>nested spider trap</title> +</head> + +<body>Nutch fetcher test page +<table> + <tr> + <td> +<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> +<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i 
></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+<i><b><i><b><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></ b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></b></i></b></i> 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>< b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b 
><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b> <i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>< 
i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i ><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i> 
<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>< b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>< 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></ b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i 
></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>< 
/b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></ i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b 
></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> </b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>< 
/i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> +</b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>< i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
</i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> + + </td> + </tr> + +</table> +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/pagea.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/pagea.html b/nutch-core/src/test/resources/fetch-test-site/pagea.html new file mode 100644 index 0000000..6444c41 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/pagea.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/pageb.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/pageb.html b/nutch-core/src/test/resources/fetch-test-site/pageb.html new file mode 100644 index 0000000..66e3725 --- /dev/null +++ 
b/nutch-core/src/test/resources/fetch-test-site/pageb.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>bage b</title> + </head> +<body> +This is page b +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/robots.txt ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/robots.txt b/nutch-core/src/test/resources/fetch-test-site/robots.txt new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/filter-all.txt ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/filter-all.txt b/nutch-core/src/test/resources/filter-all.txt new file mode 100644 index 0000000..4ed567a --- /dev/null +++ b/nutch-core/src/test/resources/filter-all.txt @@ -0,0 +1,7 @@ +# Config file for urlfilter-suffix plugin +# Filter away all urls + +# case-insensitive, disallow unknown suffixes +-I + +# allow these http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/log4j.properties b/nutch-core/src/test/resources/log4j.properties new file mode 100644 index 0000000..3ff115f --- /dev/null +++ b/nutch-core/src/test/resources/log4j.properties @@ -0,0 +1,7 @@ +# log4j configuration used during build and unit tests + +log4j.rootLogger=info,stdout +log4j.threshold=ALL +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/nutch-site.xml 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/nutch-site.xml b/nutch-core/src/test/resources/nutch-site.xml new file mode 100644 index 0000000..dd40873 --- /dev/null +++ b/nutch-core/src/test/resources/nutch-site.xml @@ -0,0 +1,19 @@ +<?xml version="1.0"?> + +<!-- Configuration overrides used during unit tests. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <value>.*</value> + <description>Enable all plugins during unit testing.</description> +</property> + +<property> + <name>distributed.search.test.port</name> + <value>60000</value> + <description>TCP port used during junit testing.</description> +</property> + +</configuration> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-mime-util/test.xlsx ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-mime-util/test.xlsx b/nutch-core/src/test/resources/test-mime-util/test.xlsx new file mode 100644 index 0000000..de33f28 Binary files /dev/null and b/nutch-core/src/test/resources/test-mime-util/test.xlsx differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc new file mode 100644 index 0000000..c321777 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc new file mode 100644 index 0000000..5c5d11f Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data new file mode 100644 index 0000000..0f8d263 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index new file mode 100644 index 0000000..4dfeaec Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc new file mode 100644 index 0000000..c4d315a Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc new file mode 100644 index 0000000..6dd171e Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data new file mode 100644 index 0000000..66b1f8d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index new file mode 100644 index 0000000..ad4ed47 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc new file mode 100644 index 0000000..8d5ffa4 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 new file mode 100644 index 0000000..41ef146 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc new file mode 100644 index 0000000..683a1dd Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 new file mode 100644 index 
0000000..3232abf Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc new file mode 100644 index 0000000..47164ee Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc new file mode 100644 index 0000000..a32d62d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data new file mode 100644 index 0000000..5b71a24 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data differ 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index new file mode 100644 index 0000000..d931103 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc new file mode 100644 index 0000000..53c925c Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc new file mode 100644 index 0000000..5ba878c Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data new file mode 100644 index 0000000..b58f97f Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index new file mode 100644 index 0000000..9880a27 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc new file mode 100644 index 0000000..1b49819 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc new file mode 100644 index 0000000..5aae648 Binary files /dev/null and 
b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data new file mode 100644 index 0000000..8069e84 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index new file mode 100644 index 0000000..9b19ce9 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc new file mode 100644 index 0000000..926ced1 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc 
---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc new file mode 100644 index 0000000..714a1e8 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data new file mode 100644 index 0000000..f36a9fa Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index new file mode 100644 index 0000000..c648d89 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc 
b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc new file mode 100644 index 0000000..3ee3c94 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 new file mode 100644 index 0000000..1ef0406 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc new file mode 100644 index 0000000..7948825 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 new file mode 100644 index 0000000..3a83a82 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 differ 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc new file mode 100644 index 0000000..b46b6f6 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc new file mode 100644 index 0000000..18766e6 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data new file mode 100644 index 0000000..9a1f284 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index new file mode 100644 index 0000000..47fb983 Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc new file mode 100644 index 0000000..ceada1b Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc new file mode 100644 index 0000000..b756b5c Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data new file mode 100644 index 0000000..ad96df0 Binary files /dev/null 
and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index new file mode 100644 index 0000000..a3e1d8d Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index differ http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/build-plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/build-plugin.xml b/nutch-plugins/build-plugin.xml new file mode 100755 index 0000000..c759d5f --- /dev/null +++ b/nutch-plugins/build-plugin.xml @@ -0,0 +1,255 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- Imported by plugin build.xml files to define default targets. 
--> +<project xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="name" value="${ant.project.name}"/> + <property name="root" value="${basedir}"/> + + <!-- load plugin-specific properties first --> + <property file="${user.home}/${name}.build.properties" /> + <property file="${root}/build.properties" /> + + <property name="nutch.root" location="${root}/../../../"/> + + <property name="src.dir" location="${root}/src/java"/> + <property name="src.test" location="${root}/src/test"/> + + <available file="${src.test}" type="dir" property="test.available"/> + + <property name="conf.dir" location="${nutch.root}/conf"/> + + <property name="build.dir" location="${nutch.root}/build/${name}"/> + <property name="build.classes" location="${build.dir}/classes"/> + <property name="build.test" location="${build.dir}/test"/> + <property name="build.test.lib" location="${build.test}/lib"/> + + <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/> + + <!-- load nutch defaults last so that they can be overridden above --> + <property file="${nutch.root}/default.properties" /> + + <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" /> + + <path id="plugin.deps"/> + + <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/> + + <!-- the normal classpath --> + <path id="classpath"> + <pathelement location="${build.classes}"/> + <fileset refid="lib.jars"/> + <pathelement location="${nutch.root}/build/classes"/> + <fileset dir="${nutch.root}/build/lib"> + <include name="*.jar" /> + </fileset> + <path refid="plugin.deps"/> + <fileset dir="${deploy.dir}"> + <include name="*.jar" /> + </fileset> + </path> + + <!-- the unit test classpath --> + <path id="test.classpath"> + <pathelement location="${build.test}" /> + <pathelement location="${nutch.root}/build/test/classes"/> + <pathelement location="${nutch.root}/src/test"/> + <pathelement location="${conf.dir}"/> + <pathelement location="${nutch.root}/build"/> + <!-- test dependencies 
specific to current plugin --> + <fileset dir="${build.test.lib}"> + <include name="*.jar" /> + </fileset> + <!-- global test dependencies --> + <fileset dir="${nutch.root}/build/test/lib"> + <include name="*.jar" /> + </fileset> + <path refid="classpath"/> + </path> + + <!-- ====================================================== --> + <!-- Stuff needed by all targets --> + <!-- ====================================================== --> + <target name="init"> + <mkdir dir="${build.dir}"/> + <mkdir dir="${build.classes}"/> + <mkdir dir="${build.test}"/> + <mkdir dir="${build.test.lib}"/> + <mkdir dir="${deploy.dir}"/> + + <antcall target="init-plugin"/> + </target> + + <!-- to be overridden by sub-projects --> + <target name="init-plugin"/> + + <!-- + ! Used to build plugin compilation dependencies + ! (to be overridden by plugins) + !--> + <target name="deps-jar"/> + + <!-- + ! Used to deploy plugin runtime dependencies + ! (to be overridden by plugins) + !--> + <target name="deps-test"/> + + <!-- + ! Used to compile test for plugin runtime dependencies + ! 
(to be overridden by plugins) + !--> + <target name="deps-test-compile"/> + + <!-- ====================================================== --> + <!-- Compile the Java files --> + <!-- ====================================================== --> + <target name="compile" depends="init,deps-jar, resolve-default"> + <echo message="Compiling plugin: ${name}"/> + <javac + encoding="${build.encoding}" + srcdir="${src.dir}" + includes="**/*.java" + destdir="${build.classes}" + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> + <classpath refid="classpath"/> + </javac> + </target> + + <target name="compile-core"> + <ant target="compile-core" inheritall="false" dir="${nutch.root}"/> + <ant target="compile"/> + </target> + + <!-- ================================================================== --> + <!-- Make plugin .jar --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="jar" depends="compile"> + <jar + jarfile="${build.dir}/${name}.jar" + basedir="${build.classes}" + /> + </target> + + <target name="jar-core" depends="compile-core"> + <jar + jarfile="${build.dir}/${name}.jar" + basedir="${build.classes}" + /> + </target> + + <!-- ================================================================== --> + <!-- Deploy plugin to ${deploy.dir} --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="deploy" depends="jar, deps-test"> + <mkdir dir="${deploy.dir}"/> + <copy file="plugin.xml" todir="${deploy.dir}" + preservelastmodified="true"/> + <available property="lib-available" + file="${build.dir}/${name}.jar"/> + <antcall target="copy-generated-lib"/> + <copy todir="${deploy.dir}" flatten="true"> + <fileset 
refid="lib.jars"/> + </copy> + </target> + + <target name="copy-generated-lib" if="lib-available"> + <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/> + </target> + + <!-- ================================================================== --> + <!-- Compile test code --> + <!-- ================================================================== --> + <target name="compile-test" depends="compile, deps-test-compile" if="test.available"> + <javac + encoding="${build.encoding}" + srcdir="${src.test}" + includes="**/*.java" + destdir="${build.test}" + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> + <classpath refid="test.classpath"/> + </javac> + </target> + + <!-- ================================================================== --> + <!-- Run unit tests --> + <!-- ================================================================== --> + <target name="test" depends="compile-test, deploy" if="test.available"> + <echo message="Testing plugin: ${name}"/> + + <junit printsummary="yes" haltonfailure="no" fork="yes" + errorProperty="tests.failed" failureProperty="tests.failed"> + <sysproperty key="test.data" value="${build.test}/data"/> + <sysproperty key="test.input" value="${root}/data"/> + <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> + <classpath refid="test.classpath"/> + <formatter type="${test.junit.output.format}" /> + <batchtest todir="${build.test}" unless="testcase"> + <fileset dir="${src.test}" + includes="**/Test*.java" excludes="**/${test.exclude}.java" /> + </batchtest> + <batchtest todir="${build.test}" if="testcase"> + <fileset dir="${src.test}" includes="**/${testcase}.java"/> + </batchtest> + </junit> + + <fail if="tests.failed">Tests failed!</fail> + + </target> + + <!-- target: resolve ================================================= 
--> + <target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> + <ivy:resolve file="ivy.xml" conf="default" log="download-only"/> + <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> + </target> + + <target name="resolve-test" depends="clean-lib" description="resolve and retrieve dependencies with ivy"> + <ivy:resolve file="ivy.xml" conf="test" log="download-only"/> + <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/> + </target> + + <!-- ================================================================== --> + <!-- Clean. Delete the build files, and their directories --> + <!-- ================================================================== --> + <!-- target: clean =================================================== --> + <target name="clean" depends="clean-build, clean-lib" description="--> clean the project" /> + + <!-- target: clean-lib =============================================== --> + <target name="clean-lib" description="--> clean the project libraries directory (dependencies)"> + <delete includeemptydirs="true" dir="${build.lib.dir}"/> + </target> + + <!-- target: clean-build ============================================= --> + <target name="clean-build" description="--> clean the project built files"> + <delete includeemptydirs="true" dir="${build.dir}"/> + <delete includeemptydirs="true" dir="${deploy.dir}"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/build.xml b/nutch-plugins/build.xml new file mode 100755 index 0000000..75ae2e7 --- /dev/null +++ b/nutch-plugins/build.xml @@ -0,0 +1,213 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="Nutch" default="deploy-core" basedir="."> + + <target name="deploy-core"> + <ant target="compile-core" inheritall="false" dir="../.."/> + <ant target="deploy"/> + </target> + + <!-- ====================================================== --> + <!-- Build & deploy all the plugin jars. --> + <!-- ====================================================== --> + <target name="deploy"> + <ant dir="creativecommons" target="deploy"/> + <ant dir="feed" target="deploy"/> + <ant dir="headings" target="deploy"/> + <ant dir="index-basic" target="deploy"/> + <ant dir="index-anchor" target="deploy"/> + <ant dir="index-geoip" target="deploy"/> + <ant dir="index-more" target="deploy"/> + <ant dir="index-replace" target="deploy"/> + <ant dir="index-static" target="deploy"/> + <ant dir="index-metadata" target="deploy"/> + <ant dir="index-links" target="deploy"/> + <ant dir="mimetype-filter" target="deploy"/> + <ant dir="indexer-cloudsearch" target="deploy"/> + <ant dir="indexer-dummy" target="deploy"/> + <ant dir="indexer-elastic" target="deploy"/> + <ant dir="indexer-solr" target="deploy"/> + <ant dir="language-identifier" target="deploy"/> + <ant dir="lib-http" target="deploy"/> + <ant dir="lib-nekohtml" target="deploy"/> + <ant dir="lib-regex-filter" target="deploy"/> + <ant dir="lib-xml" 
target="deploy"/> + <ant dir="microformats-reltag" target="deploy"/> + <ant dir="nutch-extensionpoints" target="deploy"/> + <ant dir="protocol-file" target="deploy"/> + <ant dir="protocol-ftp" target="deploy"/> + <ant dir="protocol-http" target="deploy"/> + <ant dir="protocol-httpclient" target="deploy"/> + <ant dir="lib-htmlunit" target="deploy"/> + <ant dir="protocol-htmlunit" target="deploy" /> + <ant dir="lib-selenium" target="deploy"/> + <ant dir="protocol-selenium" target="deploy" /> + <ant dir="protocol-interactiveselenium" target="deploy" /> + <ant dir="parse-ext" target="deploy"/> + <ant dir="parse-js" target="deploy"/> + <ant dir="parse-html" target="deploy"/> + <ant dir="parse-metatags" target="deploy"/> + <ant dir="parse-swf" target="deploy"/> + <ant dir="parse-tika" target="deploy"/> + <ant dir="parse-zip" target="deploy"/> + <ant dir="scoring-depth" target="deploy"/> + <ant dir="scoring-opic" target="deploy"/> + <ant dir="scoring-link" target="deploy"/> + <ant dir="scoring-similarity" target="deploy"/> + <ant dir="subcollection" target="deploy"/> + <ant dir="tld" target="deploy"/> + <ant dir="urlfilter-automaton" target="deploy"/> + <ant dir="urlfilter-domain" target="deploy" /> + <ant dir="urlfilter-domainblacklist" target="deploy" /> + <ant dir="urlfilter-prefix" target="deploy"/> + <ant dir="urlfilter-regex" target="deploy"/> + <ant dir="urlfilter-suffix" target="deploy"/> + <ant dir="urlfilter-validator" target="deploy"/> + <ant dir="urlfilter-ignoreexempt" target="deploy"/> + <ant dir="parsefilter-naivebayes" target="deploy"/> + <ant dir="parsefilter-regex" target="deploy"/> + <ant dir="urlmeta" target="deploy"/> + <ant dir="urlnormalizer-ajax" target="deploy"/> + <ant dir="urlnormalizer-basic" target="deploy"/> + <ant dir="urlnormalizer-host" target="deploy"/> + <ant dir="urlnormalizer-pass" target="deploy"/> + <ant dir="urlnormalizer-protocol" target="deploy"/> + <ant dir="urlnormalizer-querystring" target="deploy"/> + <ant 
dir="urlnormalizer-regex" target="deploy"/> + <ant dir="urlnormalizer-slash" target="deploy"/> + </target> + + <!-- ====================================================== --> + <!-- Test all of the plugins. --> + <!-- ====================================================== --> + <target name="test"> + <parallel threadCount="2"> + <ant dir="creativecommons" target="test"/> + <ant dir="index-basic" target="test"/> + <ant dir="index-anchor" target="test"/> + <ant dir="index-geoip" target="test"/> + <ant dir="index-more" target="test"/> + <ant dir="index-static" target="test"/> + <ant dir="index-replace" target="test"/> + <ant dir="index-links" target="test"/> + <ant dir="mimetype-filter" target="test"/> + <ant dir="language-identifier" target="test"/> + <ant dir="lib-http" target="test"/> + <ant dir="protocol-file" target="test"/> + <ant dir="protocol-http" target="test"/> + <ant dir="protocol-httpclient" target="test"/> + <!--ant dir="parse-ext" target="test"/--> + <ant dir="feed" target="test"/> + <ant dir="parse-html" target="test"/> + <ant dir="parse-metatags" target="test"/> + <ant dir="parse-swf" target="test"/> + <ant dir="parse-tika" target="test"/> + <ant dir="parse-zip" target="test"/> + <ant dir="parsefilter-regex" target="test"/> + <ant dir="subcollection" target="test"/> + <ant dir="urlfilter-automaton" target="test"/> + <ant dir="urlfilter-domain" target="test"/> + <ant dir="urlfilter-domainblacklist" target="test"/> + <ant dir="urlfilter-prefix" target="test"/> + <ant dir="urlfilter-regex" target="test"/> + <ant dir="urlfilter-suffix" target="test"/> + <ant dir="urlfilter-validator" target="test"/> + <ant dir="urlfilter-ignoreexempt" target="test"/> + <ant dir="urlnormalizer-ajax" target="test"/> + <ant dir="urlnormalizer-basic" target="test"/> + <ant dir="urlnormalizer-host" target="test"/> + <ant dir="urlnormalizer-pass" target="test"/> + <ant dir="urlnormalizer-protocol" target="test"/> + <ant dir="urlnormalizer-querystring" target="test"/> + <ant 
dir="urlnormalizer-regex" target="test"/> + <ant dir="urlnormalizer-slash" target="test"/> + </parallel> + </target> + + <!-- ====================================================== --> + <!-- Clean all of the plugins. --> + <!-- ====================================================== --> + <target name="clean"> + <ant dir="creativecommons" target="clean"/> + <ant dir="feed" target="clean"/> + <ant dir="headings" target="clean"/> + <ant dir="index-basic" target="clean"/> + <ant dir="index-anchor" target="clean"/> + <ant dir="index-geoip" target="clean"/> + <ant dir="index-more" target="clean"/> + <ant dir="index-static" target="clean"/> + <ant dir="index-replace" target="clean"/> + <ant dir="index-metadata" target="clean"/> + <ant dir="index-links" target="clean"/> + <ant dir="mimetype-filter" target="clean"/> + <ant dir="indexer-cloudsearch" target="clean"/> + <ant dir="indexer-dummy" target="clean"/> + <ant dir="indexer-elastic" target="clean"/> + <ant dir="indexer-solr" target="clean"/> + <ant dir="language-identifier" target="clean"/> + <!-- <ant dir="lib-commons-httpclient" target="clean"/> --> + <ant dir="lib-http" target="clean"/> + <!-- <ant dir="lib-lucene-analyzers" target="clean"/>--> + <ant dir="lib-nekohtml" target="clean"/> + <ant dir="lib-regex-filter" target="clean"/> + <ant dir="lib-xml" target="clean"/> + <ant dir="microformats-reltag" target="clean"/> + <ant dir="nutch-extensionpoints" target="clean"/> + <ant dir="protocol-file" target="clean"/> + <ant dir="protocol-ftp" target="clean"/> + <ant dir="protocol-http" target="clean"/> + <ant dir="protocol-httpclient" target="clean"/> + <ant dir="lib-htmlunit" target="clean"/> + <ant dir="protocol-htmlunit" target="clean" /> + <ant dir="lib-selenium" target="clean"/> + <ant dir="protocol-selenium" target="clean" /> + <ant dir="protocol-interactiveselenium" target="clean" /> + <ant dir="parse-ext" target="clean"/> + <ant dir="parse-js" target="clean"/> + <ant dir="parse-html" target="clean"/> + <ant 
dir="parse-metatags" target="clean"/> + <ant dir="parse-swf" target="clean"/> + <ant dir="parse-tika" target="clean"/> + <ant dir="parse-zip" target="clean"/> + <ant dir="parsefilter-regex" target="clean"/> + <ant dir="scoring-depth" target="clean"/> + <ant dir="scoring-opic" target="clean"/> + <ant dir="scoring-link" target="clean"/> + <ant dir="scoring-similarity" target="clean"/> + <ant dir="subcollection" target="clean"/> + <ant dir="tld" target="clean"/> + <ant dir="urlfilter-automaton" target="clean"/> + <ant dir="urlfilter-domain" target="clean" /> + <ant dir="urlfilter-domainblacklist" target="clean" /> + <ant dir="urlfilter-prefix" target="clean"/> + <ant dir="urlfilter-regex" target="clean"/> + <ant dir="urlfilter-suffix" target="clean"/> + <ant dir="urlfilter-validator" target="clean"/> + <ant dir="urlfilter-ignoreexempt" target="clean"/> + <ant dir="parsefilter-naivebayes" target="clean" /> + <ant dir="urlmeta" target="clean"/> + <ant dir="urlnormalizer-ajax" target="clean"/> + <ant dir="urlnormalizer-basic" target="clean"/> + <ant dir="urlnormalizer-host" target="clean"/> + <ant dir="urlnormalizer-pass" target="clean"/> + <ant dir="urlnormalizer-protocol" target="clean"/> + <ant dir="urlnormalizer-querystring" target="clean"/> + <ant dir="urlnormalizer-regex" target="clean"/> + <ant dir="urlnormalizer-slash" target="clean"/> + </target> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/README.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/README.txt b/nutch-plugins/creativecommons/README.txt new file mode 100644 index 0000000..d4d7b65 --- /dev/null +++ b/nutch-plugins/creativecommons/README.txt @@ -0,0 +1 @@ +Support for crawling and searching Creative-Commons licensed content. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/build.xml b/nutch-plugins/creativecommons/build.xml new file mode 100755 index 0000000..6443d7f --- /dev/null +++ b/nutch-plugins/creativecommons/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="creativecommons" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <!-- <ant target="deploy" inheritall="false" dir="../parse-html"/> --> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt new file mode 100644 index 0000000..324617f --- /dev/null +++ b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt @@ -0,0 +1,18 @@ +# Creative Commons crawl filter + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, mailto:, & https: urls +-^(file|ftp|mailto|https): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# accept anything else ++.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/conf/nutch-site.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/conf/nutch-site.xml b/nutch-plugins/creativecommons/conf/nutch-site.xml new file mode 100644 index 0000000..71e344b --- /dev/null +++ b/nutch-plugins/creativecommons/conf/nutch-site.xml @@ -0,0 +1,50 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?> + +<!-- Creative Commons' Nutch configuration --> + +<nutch-conf> + +<property> + <name>http.agent.name</name> + <value>CreativeCommons</value> + <description>Our HTTP 'User-Agent' request header.</description> +</property> + +<property> + <name>http.robots.agents</name> + <value>CreativeCommons,Nutch,*</value> + <description>The agent strings we'll look for in robots.txt files, + comma-separated, in decreasing order of precedence.</description> +</property> + +<property> + <name>fetcher.server.delay</name> + <value>2.0</value> + <description>We need to be more polite than when crawling an + intranet that we control.</description> +</property> + +<property> + <name>http.max.delays</name> + <value>3</value> + <description>The CC crawl visits a large number of different + hosts, so we should not need to delay much.</description> +</property> + +<property> + <name>creativecommons.exclude.unlicensed</name> + <value>true</value> + <description>Exclude HTML content which does not contain a CC license. + </description> +</property> + +<property> + <name>plugin.excludes</name> + <value>parse-(?!html).*</value> + <description>Exclude non-HTML content, since we don't know how to + find a CC license in anything but HTML. 
+ </description> +</property> + +</nutch-conf> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/ivy.xml b/nutch-plugins/creativecommons/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/creativecommons/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module>
